From 775fed8172d7e3bbc64c1c69af8be416ff92856e Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Thu, 11 Apr 2024 08:43:00 +0000
Subject: [PATCH 1/3] use tensor.shape instead of paddle.shape(tensor)

---
 .../language_model/moe/dygraph/modeling.py | 4 +-
 .../task/senti/rnn/model.py | 12 ++--
 .../stacl/demo/model_demo.py | 2 +-
 .../simultaneous_translation/stacl/model.py | 10 ++--
 examples/text_classification/rnn/model.py | 10 ++--
 .../RAT-SQL/text2sql/utils/nn_utils.py | 2 +-
 llm/ernie-3.5-se/modeling.py | 4 +-
 .../language_model/gpt/auto/auto_model.py | 16 ++---
 .../gpt/dygraph/hybrid_model.py | 8 +--
 .../gpt/dygraph/single_model.py | 12 ++--
 .../transformers/bloom/modeling.py | 2 +-
 .../transformers/chatglm_v2/modeling.py | 2 +-
 .../experimental/transformers/gpt/modeling.py | 2 +-
 .../transformers/llama/modeling.py | 2 +-
 .../experimental/transformers/opt/modeling.py | 2 +-
 .../transformers/qwen/modeling.py | 2 +-
 paddlenlp/generation/utils.py | 14 ++---
 paddlenlp/layers/crf.py | 8 +--
 paddlenlp/layers/globalpointer.py | 2 +-
 .../fast_transformer/transformer/decoder.py | 6 +-
 .../fast_transformer/transformer/decoding.py | 14 ++---
 .../transformer/fast_transformer.py | 6 +-
 paddlenlp/prompt/prompt_utils.py | 4 +-
 paddlenlp/trainer/trainer_compress.py | 4 +-
 paddlenlp/transformers/albert/modeling.py | 2 +-
 paddlenlp/transformers/bart/modeling.py | 14 ++---
 paddlenlp/transformers/bert/modeling.py | 2 +-
 paddlenlp/transformers/bigbird/modeling.py | 8 +--
 paddlenlp/transformers/blenderbot/modeling.py | 2 +-
 .../transformers/blenderbot_small/modeling.py | 2 +-
 paddlenlp/transformers/bloom/modeling.py | 12 ++--
 paddlenlp/transformers/convbert/modeling.py | 8 +--
 paddlenlp/transformers/dallebart/modeling.py | 6 +-
 paddlenlp/transformers/deberta/modeling.py | 6 +-
 paddlenlp/transformers/deberta_v2/modeling.py | 6 +-
 paddlenlp/transformers/electra/modeling.py | 2 +-
 paddlenlp/transformers/ernie/modeling.py | 8 +--
 paddlenlp/transformers/ernie_code/modeling.py | 20 +++----
 paddlenlp/transformers/ernie_ctm/modeling.py | 2 +-
 paddlenlp/transformers/ernie_gen/modeling.py | 2 +-
 paddlenlp/transformers/ernie_gram/modeling.py | 4 +-
 .../transformers/ernie_layout/modeling.py | 20 +++----
 paddlenlp/transformers/ernie_m/modeling.py | 4 +-
 paddlenlp/transformers/gau_alpha/modeling.py | 12 ++--
 paddlenlp/transformers/gpt/modeling.py | 18 +++---
 paddlenlp/transformers/gpt/modeling_auto.py | 10 ++--
 paddlenlp/transformers/layoutlmv2/modeling.py | 6 +-
 paddlenlp/transformers/layoutxlm/modeling.py | 16 ++---
 paddlenlp/transformers/llama/modeling.py | 2 +-
 paddlenlp/transformers/llama/modeling_auto.py | 2 +-
 .../llama/modeling_auto_static.py | 2 +-
 paddlenlp/transformers/mbart/modeling.py | 16 ++---
 paddlenlp/transformers/minigpt4/modeling.py | 2 +-
 paddlenlp/transformers/mixtral/modeling.py | 2 +-
 paddlenlp/transformers/mobilebert/modeling.py | 4 +-
 paddlenlp/transformers/mt5/modeling.py | 20 +++----
 paddlenlp/transformers/nezha/modeling.py | 4 +-
 paddlenlp/transformers/opt/modeling.py | 6 +-
 paddlenlp/transformers/pegasus/modeling.py | 10 ++--
 paddlenlp/transformers/reformer/modeling.py | 4 +-
 paddlenlp/transformers/roberta/modeling.py | 10 ++--
 paddlenlp/transformers/roformer/modeling.py | 6 +-
 paddlenlp/transformers/roformerv2/modeling.py | 10 ++--
 paddlenlp/transformers/skep/modeling.py | 8 +--
 paddlenlp/transformers/t5/modeling.py | 20 +++----
 paddlenlp/transformers/tinybert/modeling.py | 2 +-
 .../transformers/transformer/modeling.py | 24 ++++----
 .../unified_transformer/modeling.py | 11 ++--
paddlenlp/transformers/unimo/modeling.py | 11 ++-- paddlenlp/transformers/xlm/modeling.py | 22 ++++--- paddlenlp/transformers/xlnet/modeling.py | 60 +++++++------------ .../modules/bert_for_question_answering.py | 2 +- tests/test_tipc/transformer/modeling.py | 14 ++--- tests/transformer/modeling.py | 14 ++--- 74 files changed, 298 insertions(+), 332 deletions(-) diff --git a/examples/language_model/moe/dygraph/modeling.py b/examples/language_model/moe/dygraph/modeling.py index b45c9a465e70..17a77245ee1e 100644 --- a/examples/language_model/moe/dygraph/modeling.py +++ b/examples/language_model/moe/dygraph/modeling.py @@ -748,8 +748,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(cache[0].k)[-2] - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype="int64") + past_length = cache[0].k.shape[-2] + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype="int64") position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) diff --git a/examples/model_interpretation/task/senti/rnn/model.py b/examples/model_interpretation/task/senti/rnn/model.py index 9c509e72432e..247a5f65bc5e 100644 --- a/examples/model_interpretation/task/senti/rnn/model.py +++ b/examples/model_interpretation/task/senti/rnn/model.py @@ -207,7 +207,7 @@ def forward(self, input, mask=None): # Shape: (batch_size, max_seq_len, hidden_size) h = paddle.add_n([forward_input, backward_input]) # Shape: (batch_size, hidden_size, 1) - att_weight = self.att_weight.tile(repeat_times=(paddle.shape(h)[0], 1, 1)) + att_weight = self.att_weight.tile(repeat_times=(h.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, 1) att_score = paddle.bmm(paddle.tanh(h), att_weight) if mask is not None: @@ -246,20 +246,18 @@ def forward(self, input, mask=None): Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. 
Defaults to `None """ - weight = self.input_weight.tile( - repeat_times=(paddle.shape(input)[0], 1, 1) - ) # tensor[batch, hidden_size, hidden_size] - bias = self.bias.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) # tensor[batch, 1, hidden_size] + weight = self.input_weight.tile(repeat_times=(input.shape[0], 1, 1)) # tensor[batch, hidden_size, hidden_size] + bias = self.bias.tile(repeat_times=(input.shape[0], 1, 1)) # tensor[batch, 1, hidden_size] word_squish = paddle.bmm(input, weight) + bias # Shape: (batch_size, seq_len, hidden_size) att_context_vector = self.att_context_vector.tile( - repeat_times=(paddle.shape(input)[0], 1, 1) + repeat_times=(input.shape[0], 1, 1) ) # Shape: (batch_size, hidden_size, 1) att_score = paddle.bmm(word_squish, att_context_vector) # tensor[batch_size, seq_len, 1] if mask is not None: # mask, remove the effect of 'PAD' mask = paddle.cast(mask, dtype="float32") mask = mask.unsqueeze(axis=-1) - inf_tensor = paddle.full(shape=paddle.shape(mask), dtype="float32", fill_value=-INF) + inf_tensor = paddle.full(shape=mask.shape, dtype="float32", fill_value=-INF) att_score = paddle.multiply(att_score, mask) + paddle.multiply(inf_tensor, (1 - mask)) att_weight = F.softmax(att_score, axis=1) # tensor[batch_size, seq_len, 1] diff --git a/examples/simultaneous_translation/stacl/demo/model_demo.py b/examples/simultaneous_translation/stacl/demo/model_demo.py index 6f7a2cc7dfb4..7f73dfe06a10 100644 --- a/examples/simultaneous_translation/stacl/demo/model_demo.py +++ b/examples/simultaneous_translation/stacl/demo/model_demo.py @@ -34,7 +34,7 @@ def greedy_search(self, src_word, max_len=256, waitk=-1, caches=None, bos_id=Non So, it needsprevious state(caches) and last one of generated tokens id last time. """ - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] base_attn_bias = ( paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) diff --git a/examples/simultaneous_translation/stacl/model.py b/examples/simultaneous_translation/stacl/model.py index e987178dd87e..185156a89908 100644 --- a/examples/simultaneous_translation/stacl/model.py +++ b/examples/simultaneous_translation/stacl/model.py @@ -15,11 +15,11 @@ from __future__ import print_function import numpy as np - import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlenlp.transformers import WordEmbedding, PositionalEmbedding + +from paddlenlp.transformers import PositionalEmbedding, WordEmbedding class CrossEntropyCriterion(nn.Layer): @@ -190,8 +190,8 @@ def __init__( self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False) def forward(self, src_word, trg_word): - src_max_len = paddle.shape(src_word)[-1] - trg_max_len = paddle.shape(trg_word)[-1] + src_max_len = src_word.shape[-1] + trg_max_len = trg_word.shape[-1] base_attn_bias = ( paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) @@ -236,7 +236,7 @@ def beam_search(self, src_word, beam_size=4, max_len=256, waitk=-1): raise NotImplementedError def greedy_search(self, src_word, max_len=256, waitk=-1): - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] base_attn_bias = ( paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) diff --git a/examples/text_classification/rnn/model.py b/examples/text_classification/rnn/model.py index 7d2e4950db0b..04ce46cd62b5 100644 --- a/examples/text_classification/rnn/model.py +++ 
b/examples/text_classification/rnn/model.py @@ -253,7 +253,7 @@ def forward(self, input, mask=None): # Shape: (batch_size, max_seq_len, hidden_size) h = paddle.add_n([forward_input, backward_input]) # Shape: (batch_size, hidden_size, 1) - att_weight = self.att_weight.tile(repeat_times=(paddle.shape(h)[0], 1, 1)) + att_weight = self.att_weight.tile(repeat_times=(h.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, 1) att_score = paddle.bmm(paddle.tanh(h), att_weight) if mask is not None: @@ -292,19 +292,19 @@ def forward(self, input, mask=None): Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. Defaults to `None """ - weight = self.input_weight.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) - bias = self.bias.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) + weight = self.input_weight.tile(repeat_times=(input.shape[0], 1, 1)) + bias = self.bias.tile(repeat_times=(input.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, hidden_size) word_squish = paddle.bmm(input, weight) + bias - att_context_vector = self.att_context_vector.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) + att_context_vector = self.att_context_vector.tile(repeat_times=(input.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, 1) att_score = paddle.bmm(word_squish, att_context_vector) if mask is not None: # mask, remove the effect of 'PAD' mask = paddle.cast(mask, dtype="float32") mask = mask.unsqueeze(axis=-1) - inf_tensor = paddle.full(shape=paddle.shape(mask), dtype="float32", fill_value=-INF) + inf_tensor = paddle.full(shape=mask.shape, dtype="float32", fill_value=-INF) att_score = paddle.multiply(att_score, mask) + paddle.multiply(inf_tensor, (1 - mask)) att_weight = F.softmax(att_score, axis=1) diff --git a/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py b/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py index 02d04743d52c..fe13c7d489b7 100644 --- a/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py +++ b/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py @@ -74,7 +74,7 @@ def batch_gather_2d(var, indices): "shape of indices error. it should be a 2-D layers. 
" "but got shape = %s" % (str(indices.shape),) ) - batch_size = paddle.shape(indices)[0] + batch_size = indices.shape[0] zero = paddle.to_tensor([0], dtype="int64") one = paddle.to_tensor([1], dtype="int64") diff --git a/llm/ernie-3.5-se/modeling.py b/llm/ernie-3.5-se/modeling.py index 570433b994c9..9e1165e71a65 100644 --- a/llm/ernie-3.5-se/modeling.py +++ b/llm/ernie-3.5-se/modeling.py @@ -142,7 +142,7 @@ def scaled_dot_product_attention( query_states, key_states, value_states, attention_mask, output_attentions, config, is_causal=True ): - bsz, q_len, num_heads, _ = paddle.shape(query_states) + bsz, q_len, num_heads, _ = query_states.shape head_dim = config.hidden_size // config.num_attention_heads _, kv_seq_len, _, _ = value_states.shape @@ -1054,7 +1054,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids).astype(self.embed_tokens.weight.dtype) diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py index a283bb7b46fe..45f8ed4e556d 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py @@ -735,8 +735,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(attention_mask)[-1] - 1 - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) + past_length = attention_mask.shape[-1] - 1 + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) position_ids = paddle.expand_as(position_ids, input_ids) @@ -753,7 +753,7 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(): # TODO, use registered buffer causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1 + paddle.ones((input_ids.shape[-1], input_ids.shape[-1])) * -1e4, diagonal=1 ) if attention_mask is not None: if len(attention_mask.shape) == 2: @@ -972,7 +972,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) @@ -1109,11 +1109,11 @@ def TopPProcess(probs, top_p, min_tokens_to_keep): probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs - batch_size, cur_len = paddle.shape(input_ids) + batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype="int64") - origin_len = paddle.shape(input_ids)[1] + origin_len = input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype="int64") @@ -1167,7 +1167,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, 
unfinished_f raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) # TODO fake random seed here # Users should set the random seed dynamically when inference _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) @@ -1299,7 +1299,7 @@ def forward(self, input_ids=None, **model_kwargs): if model_kwargs.get("position_ids", None) is None: model_kwargs["position_ids"] = paddle.arange( - 0, paddle.shape(model_kwargs["attention_mask"])[-1], dtype=input_ids.dtype + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype ).unsqueeze(0) self.is_encoder_decoder = False diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index f47d800c5f15..38380d82f93b 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -834,8 +834,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(attention_mask)[-1] - 1 - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) + past_length = attention_mask.shape[-1] - 1 + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) @@ -848,7 +848,7 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(): # TODO, use registered buffer causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1 + paddle.ones((input_ids.shape[-1], input_ids.shape[-1])) * -1e4, diagonal=1 ) if attention_mask is not None: if len(attention_mask.shape) == 2: @@ -1301,7 +1301,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 80ca22b855ca..ccbe318790c2 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -602,8 +602,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(attention_mask)[-1] - 1 - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) + past_length = attention_mask.shape[-1] - 1 + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype=input_ids.dtype) position_ids = 
position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) @@ -615,7 +615,7 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(): # TODO, use registered buffer causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1 + paddle.ones((input_ids.shape[-1], input_ids.shape[-1])) * -1e4, diagonal=1 ) if attention_mask is not None: if len(attention_mask.shape) == 2: @@ -848,7 +848,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) @@ -1039,7 +1039,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) @@ -1194,7 +1194,7 @@ def forward(self, input_ids=None, **model_kwargs): if model_kwargs.get("position_ids", None) is None: model_kwargs["position_ids"] = paddle.arange( - 0, paddle.shape(model_kwargs["attention_mask"])[-1], dtype=input_ids.dtype + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype ).unsqueeze(0) self.is_encoder_decoder = False diff --git a/paddlenlp/experimental/transformers/bloom/modeling.py b/paddlenlp/experimental/transformers/bloom/modeling.py index 659826fe6f1b..fbb983622fef 100644 --- a/paddlenlp/experimental/transformers/bloom/modeling.py +++ b/paddlenlp/experimental/transformers/bloom/modeling.py @@ -279,7 +279,7 @@ def forward( pre_caches=pre_caches, pre_caches_length=position_offset, seq_lens=seq_len, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) # Add last hidden state diff --git a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py index 75dd08396398..712f03dde2fe 100644 --- a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py +++ b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py @@ -285,7 +285,7 @@ def forward( seq_lens=seq_lens, rotary_embs=paddle.cast(rotary_pos_emb, "float32"), rotary_emb_dims=1, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.final_layernorm(hidden_states) diff --git a/paddlenlp/experimental/transformers/gpt/modeling.py b/paddlenlp/experimental/transformers/gpt/modeling.py index 6627c9e42abb..b987c7a70974 100644 --- a/paddlenlp/experimental/transformers/gpt/modeling.py +++ b/paddlenlp/experimental/transformers/gpt/modeling.py @@ -265,7 +265,7 @@ def forward( attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), caches=cache_kvs, seq_lens=seq_lens, - 
time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.norm(hidden_states) diff --git a/paddlenlp/experimental/transformers/llama/modeling.py b/paddlenlp/experimental/transformers/llama/modeling.py index f22eecb15d19..a67731916a92 100644 --- a/paddlenlp/experimental/transformers/llama/modeling.py +++ b/paddlenlp/experimental/transformers/llama/modeling.py @@ -451,7 +451,7 @@ def forward( seq_lens=seq_lens, rotary_embs=new_rope, rotary_emb_dims=1, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.norm(hidden_states) diff --git a/paddlenlp/experimental/transformers/opt/modeling.py b/paddlenlp/experimental/transformers/opt/modeling.py index afcb1331b52c..2f8228a75947 100644 --- a/paddlenlp/experimental/transformers/opt/modeling.py +++ b/paddlenlp/experimental/transformers/opt/modeling.py @@ -247,7 +247,7 @@ def forward( seq_lens=seq_lens, rotary_embs=None, rotary_emb_dims=0, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) output = hidden_states diff --git a/paddlenlp/experimental/transformers/qwen/modeling.py b/paddlenlp/experimental/transformers/qwen/modeling.py index fc6bb92a627d..975bf22abf15 100644 --- a/paddlenlp/experimental/transformers/qwen/modeling.py +++ b/paddlenlp/experimental/transformers/qwen/modeling.py @@ -340,7 +340,7 @@ def forward( seq_lens=seq_lens, rotary_embs=new_rope, rotary_emb_dims=1, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.ln_f(hidden_states) diff --git a/paddlenlp/generation/utils.py b/paddlenlp/generation/utils.py index 625b81d765ff..ffd34b1d79cd 100644 --- a/paddlenlp/generation/utils.py +++ b/paddlenlp/generation/utils.py @@ -412,9 +412,9 @@ def get_logits_processor( @staticmethod def expand_inputs_for_generation(input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile( - paddle.arange(paddle.shape(input_ids)[0], dtype="int64").unsqueeze(-1), [1, expand_size] - ).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0], dtype="int64").unsqueeze(-1), [1, expand_size]).reshape( + [-1] + ) input_ids = paddle.gather(input_ids, index) @@ -1340,11 +1340,11 @@ def sample_d2s( "you should not specify InputSpec for top_k and top_p parameters, one of InputSpec is expected" ) - batch_size, cur_len = paddle.shape(input_ids) + batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype="int64") - origin_len = paddle.shape(input_ids)[1] + origin_len = input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype="int64") @@ -1384,7 +1384,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f # compute next_tokens if use_top_p: logits = logits / temperature - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0], 1], fill_value=top_p, dtype=probs.dtype) _, next_tokens = paddle.tensor.top_p_sampling(probs, top_ps_tensor) else: 
probs = TopKProcess(probs, top_k, min_tokens_to_keep) @@ -1428,7 +1428,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f attn_mask = model_kwargs["attention_mask"] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. - model_kwargs["attention_mask"] = paddle.reshape(attn_mask, paddle.shape(attn_mask)) + model_kwargs["attention_mask"] = paddle.reshape(attn_mask, attn_mask.shape) model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else None max_new_tokens = paddle.full([1], max_new_tokens + cur_len - 1, dtype="int64") diff --git a/paddlenlp/layers/crf.py b/paddlenlp/layers/crf.py index aaaec528ca5f..5dc6c6363afc 100644 --- a/paddlenlp/layers/crf.py +++ b/paddlenlp/layers/crf.py @@ -303,7 +303,7 @@ def __init__(self, transitions, with_start_stop_tag=True): if with_start_stop_tag: self.start_idx = -1 self.stop_idx = -2 - self.num_tags = paddle.shape(transitions)[0] + self.num_tags = transitions.shape[0] self._initial_alpha = None self._index = None @@ -312,7 +312,7 @@ def __init__(self, transitions, with_start_stop_tag=True): def _initialize_alpha(self, batch_size): # alpha accumulate the path value to get the different next tag - if self._initial_alpha is None or batch_size > paddle.shape(self._initial_alpha)[0]: + if self._initial_alpha is None or batch_size > self._initial_alpha.shape[0]: # Initialized by a small value. initial_alpha = paddle.full([batch_size, self.num_tags - 1], dtype="float32", fill_value=-10000.0) # alpha_start fill_value = 0. > -10000., means the first one step START gets the most score. @@ -336,7 +336,7 @@ def forward(self, inputs, lengths): The `paths` tensor containing the highest scoring tag indices. Its dtype is int64 and has a shape of `[batch_size, sequence_length]`. 
""" - input_shape = paddle.shape(inputs) + input_shape = inputs.shape batch_size = input_shape[0] n_label = input_shape[2] @@ -412,6 +412,6 @@ def forward(self, inputs, lengths): return scores, batch_path def _get_batch_index(self, batch_size): - if self._batch_index is None or batch_size != paddle.shape(self._batch_index)[0]: + if self._batch_index is None or batch_size != self._batch_index.shape[0]: self._batch_index = paddle.arange(end=batch_size, dtype="int64") return self._batch_index diff --git a/paddlenlp/layers/globalpointer.py b/paddlenlp/layers/globalpointer.py index d11aedc9ddb0..a76c606098b6 100644 --- a/paddlenlp/layers/globalpointer.py +++ b/paddlenlp/layers/globalpointer.py @@ -26,7 +26,7 @@ def __init__(self, dim, max_seq_len=512): self.register_buffer("cos", freqs.cos(), persistable=False) def forward(self, x, offset=0): - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], diff --git a/paddlenlp/ops/fast_transformer/transformer/decoder.py b/paddlenlp/ops/fast_transformer/transformer/decoder.py index 988b861e810d..82b0f2339aec 100644 --- a/paddlenlp/ops/fast_transformer/transformer/decoder.py +++ b/paddlenlp/ops/fast_transformer/transformer/decoder.py @@ -275,7 +275,7 @@ def forward( [ self_cache_key, paddle.zeros( - shape=[len(self.weights), 1, paddle.shape(memory_tensor)[0], self.n_head * self.size_per_head], + shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], dtype=self_cache_key.dtype, ), ], @@ -285,7 +285,7 @@ def forward( [ self_cache_value, paddle.zeros( - shape=[len(self.weights), 1, paddle.shape(memory_tensor)[0], self.n_head * self.size_per_head], + shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], dtype=self_cache_value.dtype, ), ], @@ -458,7 +458,7 @@ def __init__( self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False) def forward(self, src_word): - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] mem_seq_lens = paddle.sum( paddle.cast(src_word != self.bos_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" ) diff --git a/paddlenlp/ops/fast_transformer/transformer/decoding.py b/paddlenlp/ops/fast_transformer/transformer/decoding.py index 8cac1f9026ba..28b30faebc2b 100644 --- a/paddlenlp/ops/fast_transformer/transformer/decoding.py +++ b/paddlenlp/ops/fast_transformer/transformer/decoding.py @@ -2572,7 +2572,7 @@ def parse_function(func_name): memory_seq_lens, self._beam_size ) else: - enc_output_shape = paddle.shape(enc_output) + enc_output_shape = enc_output.shape batch_size = enc_output_shape[0] max_seq_len = enc_output_shape[1] enc_output = enc_output.unsqueeze([1]) @@ -2995,7 +2995,7 @@ def forward( temperature=1, ): if attention_mask is None: - batch_size = paddle.shape(input_ids)[0] + batch_size = input_ids.shape[0] attention_mask = paddle.tril( paddle.ones( [batch_size, mem_seq_len, mem_seq_len], dtype="float16" if self.use_fp16_decoding else "float32" @@ -3042,7 +3042,7 @@ def forward( use_fp16_decoding=self.use_fp16_decoding, ) - output_ids = output_ids[paddle.shape(input_ids)[-1] :, :] + output_ids = output_ids[input_ids.shape[-1] :, :] if forced_eos_token_id is not None: output_ids[:, -1] = forced_eos_token_id return output_ids @@ -3100,7 +3100,7 @@ def forward( temperature=1, ): if attention_mask is None: - batch_size = paddle.shape(input_ids)[0] + batch_size = input_ids.shape[0] attention_mask = paddle.tril( 
paddle.ones( [batch_size, paddle.max(mem_seq_len), paddle.max(mem_seq_len)], @@ -3147,7 +3147,7 @@ def forward( use_fp16_decoding=self.use_fp16_decoding, ) - output_ids = output_ids[paddle.shape(input_ids)[-1] :, :] + output_ids = output_ids[input_ids.shape[-1] :, :] if forced_eos_token_id is not None: output_ids[:, -1] = forced_eos_token_id return output_ids @@ -4117,7 +4117,7 @@ def forward( min_length=0, ): if attention_mask is None: - batch_size, input_length = paddle.shape(input_ids) + batch_size, input_length = input_ids.shape attention_mask = paddle.unsqueeze((input_ids != pad_token_id).astype("float32"), axis=[1]) causal_mask = paddle.tril(paddle.ones([batch_size, input_length, input_length], dtype="float32")) attention_mask = paddle.logical_and(attention_mask, causal_mask) @@ -4161,7 +4161,7 @@ def forward( use_fp16_decoding=self.use_fp16_decoding, ) - output_ids = output_ids[paddle.shape(input_ids)[-1] :, :] + output_ids = output_ids[input_ids.shape[-1] :, :] if forced_eos_token_id is not None: output_ids[:, -1] = forced_eos_token_id return output_ids diff --git a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py b/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py index af0ecd3f3101..b7b87c47a4c2 100644 --- a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py +++ b/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py @@ -235,7 +235,7 @@ def __init__( ) def forward(self, src_word, trg_word=None): - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) @@ -1619,7 +1619,7 @@ def forward( encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ "encoder_output" ] - batch_size = paddle.shape(encoder_output)[0] + batch_size = encoder_output.shape[0] if seq_len is None: assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") @@ -1649,7 +1649,7 @@ def forward( else: forced_bos_token_id = paddle.zeros([0]) elif decode_strategy == "sampling": - num_samples = paddle.shape(encoder_output)[0] + num_samples = encoder_output.shape[0] forced_bos_token_id = paddle.expand(forced_bos_token_id, shape=[num_samples, 1]) return self.decoding( diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index d230fbf1ab41..f446154aa79e 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -198,9 +198,7 @@ def masked_lm_forward_with_past_key_values( masked_lm_loss = None if labels is not None: loss_fct = paddle.nn.CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) - ) + masked_lm_loss = loss_fct(prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,))) return MaskedLMOutput( loss=masked_lm_loss, diff --git a/paddlenlp/trainer/trainer_compress.py b/paddlenlp/trainer/trainer_compress.py index 27629e2778e7..f2f945cd128f 100644 --- a/paddlenlp/trainer/trainer_compress.py +++ b/paddlenlp/trainer/trainer_compress.py @@ -871,9 +871,9 @@ def auto_model_dynabert_forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") diff --git a/paddlenlp/transformers/albert/modeling.py b/paddlenlp/transformers/albert/modeling.py index 362c9a5527c1..465bb0738b66 100644 --- a/paddlenlp/transformers/albert/modeling.py +++ b/paddlenlp/transformers/albert/modeling.py @@ -1408,7 +1408,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/bart/modeling.py b/paddlenlp/transformers/bart/modeling.py index e9cfcd08c33f..7b62163d6f00 100644 --- a/paddlenlp/transformers/bart/modeling.py +++ b/paddlenlp/transformers/bart/modeling.py @@ -453,10 +453,10 @@ def forward( if input_ids is None and inputs_embeds is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = paddle.shape(input_ids) + inputs_shape = input_ids.shape input_ids = input_ids.reshape((-1, inputs_shape[-1])) elif inputs_embeds is not None: - inputs_shape = paddle.shape(inputs_embeds)[:-1] + inputs_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") @@ -566,10 +566,10 @@ def forward( if decoder_input_ids is not None and decoder_inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif decoder_input_ids is not None: - inputs_shape = paddle.shape(decoder_input_ids) + inputs_shape = decoder_input_ids.shape decoder_input_ids = decoder_input_ids.reshape((-1, 
inputs_shape[-1])) elif decoder_inputs_embeds is not None: - inputs_shape = paddle.shape(decoder_inputs_embeds)[:-1] + inputs_shape = decoder_inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -582,7 +582,7 @@ def forward( if decoder_inputs_embeds is None: decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 decoder_inputs_embed_pos = self.decoder_embed_positions(inputs_shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos hidden_states = self.decoder_layernorm_embedding(hidden_states) @@ -976,7 +976,7 @@ def forward( return_dict=return_dict, ) output = outputs[0] - output_shape = paddle.shape(output) + output_shape = output.shape if input_ids is not None: eos_mask = paddle.cast(input_ids == self.bart.config["eos_token_id"], dtype="int64") @@ -1168,7 +1168,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index ed980805f904..03095def4cd3 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -599,7 +599,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/bigbird/modeling.py b/paddlenlp/transformers/bigbird/modeling.py index 771367d333e2..0effe9d18137 100644 --- a/paddlenlp/transformers/bigbird/modeling.py +++ b/paddlenlp/transformers/bigbird/modeling.py @@ -238,10 +238,10 @@ def forward( inputs_embeds: Optional[Tensor] = None, ): if input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape inputs_embeds = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") @@ -382,7 +382,7 @@ def _process_mask(self, input_ids, inputs_embeds, attention_mask=None): if input_ids is not None: attention_mask = (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) else: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] attention_mask = paddle.zeros(input_shape, dtype=self.pooler.dense.weight.dtype) # [B, 1, T, 1] @@ -1219,7 +1219,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/blenderbot/modeling.py 
b/paddlenlp/transformers/blenderbot/modeling.py index 8a15bc75c173..fb1fcfcd78e0 100644 --- a/paddlenlp/transformers/blenderbot/modeling.py +++ b/paddlenlp/transformers/blenderbot/modeling.py @@ -339,7 +339,7 @@ def forward( if decoder_input_ids is None: raise ValueError("Decoder_input_ids cannot be None.") if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.tensor.triu( (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 ) diff --git a/paddlenlp/transformers/blenderbot_small/modeling.py b/paddlenlp/transformers/blenderbot_small/modeling.py index feedfe5f6c67..74fe6b764426 100644 --- a/paddlenlp/transformers/blenderbot_small/modeling.py +++ b/paddlenlp/transformers/blenderbot_small/modeling.py @@ -341,7 +341,7 @@ def forward( if decoder_input_ids is None: raise ValueError("Decoder_input_ids cannot be None.") if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.tensor.triu( (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 ) diff --git a/paddlenlp/transformers/bloom/modeling.py b/paddlenlp/transformers/bloom/modeling.py index 25f54f84d8dc..f18b88f406e0 100755 --- a/paddlenlp/transformers/bloom/modeling.py +++ b/paddlenlp/transformers/bloom/modeling.py @@ -1546,7 +1546,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) @@ -1654,12 +1654,12 @@ def TopPProcess(probs, top_p, min_tokens_to_keep): probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs - batch_size, cur_len = paddle.shape(input_ids) + batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len) - origin_len = paddle.shape(input_ids)[1] + origin_len = input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len) @@ -1721,7 +1721,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) next_tokens = topp_sampling(probs, top_ps_tensor) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) @@ -1766,7 +1766,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f attn_mask = model_kwargs["attention_mask"] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
- model_kwargs["attention_mask"] = paddle.reshape(attn_mask, paddle.shape(attn_mask)) + model_kwargs["attention_mask"] = paddle.reshape(attn_mask, attn_mask.shape) model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else None max_length = paddle.to_tensor(max_length) while cur_len < max_length: @@ -1855,7 +1855,7 @@ def forward(self, input_ids=None, **model_kwargs): if model_kwargs.get("position_ids", None) is None: model_kwargs["position_ids"] = paddle.arange( - 0, paddle.shape(model_kwargs["attention_mask"])[-1], dtype=input_ids.dtype + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype ).unsqueeze(0) self.is_encoder_decoder = False diff --git a/paddlenlp/transformers/convbert/modeling.py b/paddlenlp/transformers/convbert/modeling.py index d5ec8e843c2a..fa64a09ae2b9 100644 --- a/paddlenlp/transformers/convbert/modeling.py +++ b/paddlenlp/transformers/convbert/modeling.py @@ -172,8 +172,8 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): v = self.v_proj(value) if self.conv_type == "sdconv": - bs = paddle.shape(q)[0] - seqlen = paddle.shape(q)[1] + bs = q.shape[0] + seqlen = q.shape[1] mixed_key_conv_attn_layer = self.key_conv_attn_layer(query) conv_attn_layer = mixed_key_conv_attn_layer * q @@ -290,7 +290,7 @@ def forward( if input_ids is not None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -1518,7 +1518,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/dallebart/modeling.py b/paddlenlp/transformers/dallebart/modeling.py index 1e5d50009363..06ced887439e 100644 --- a/paddlenlp/transformers/dallebart/modeling.py +++ b/paddlenlp/transformers/dallebart/modeling.py @@ -400,7 +400,7 @@ def forward( Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
""" if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.triu( ( paddle.full( @@ -412,8 +412,8 @@ def forward( 1, ) decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 - decoder_inputs_embed_pos = self.embed_positions(paddle.shape(decoder_input_ids), past_key_values_length) + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.embed_positions(decoder_input_ids.shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos hidden_states = self.layernorm_embedding(hidden_states) hidden_states = self.dropout(hidden_states) diff --git a/paddlenlp/transformers/deberta/modeling.py b/paddlenlp/transformers/deberta/modeling.py index 26a8e6d7789f..806e77d38cdb 100644 --- a/paddlenlp/transformers/deberta/modeling.py +++ b/paddlenlp/transformers/deberta/modeling.py @@ -1184,7 +1184,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -1331,9 +1331,9 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: - num_choices = paddle.shape(input_ids)[1] + num_choices = input_ids.shape[1] elif inputs_embeds is not None: - num_choices = paddle.shape(inputs_embeds)[1] + num_choices = inputs_embeds.shape[1] input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None inputs_embeds = ( diff --git a/paddlenlp/transformers/deberta_v2/modeling.py b/paddlenlp/transformers/deberta_v2/modeling.py index 7f0aa4679e26..0779780feaf7 100644 --- a/paddlenlp/transformers/deberta_v2/modeling.py +++ b/paddlenlp/transformers/deberta_v2/modeling.py @@ -1288,7 +1288,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -1435,9 +1435,9 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: - num_choices = paddle.shape(input_ids)[1] + num_choices = input_ids.shape[1] elif inputs_embeds is not None: - num_choices = paddle.shape(inputs_embeds)[1] + num_choices = inputs_embeds.shape[1] input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None inputs_embeds = ( diff --git a/paddlenlp/transformers/electra/modeling.py b/paddlenlp/transformers/electra/modeling.py index b3b0b67c1a3d..722795bf6dac 100644 --- a/paddlenlp/transformers/electra/modeling.py +++ b/paddlenlp/transformers/electra/modeling.py @@ -1783,7 +1783,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = 
start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py index 754e220a883b..0833a5a8b387 100644 --- a/paddlenlp/transformers/ernie/modeling.py +++ b/paddlenlp/transformers/ernie/modeling.py @@ -97,7 +97,7 @@ def forward( if input_ids is not None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = inputs_embeds.shape[:-1] if in_declarative_mode() else paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if in_declarative_mode() else inputs_embeds.shape[:-1] if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph @@ -611,7 +611,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -911,7 +911,7 @@ def forward( if labels is not None and next_sentence_label is not None: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) ) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,)) @@ -1088,7 +1088,7 @@ def forward( if labels is not None: loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) ) if not return_dict: output = (prediction_scores,) + outputs[2:] diff --git a/paddlenlp/transformers/ernie_code/modeling.py b/paddlenlp/transformers/ernie_code/modeling.py index 0649966c64f3..d83e1423b3be 100644 --- a/paddlenlp/transformers/ernie_code/modeling.py +++ b/paddlenlp/transformers/ernie_code/modeling.py @@ -286,15 +286,15 @@ def forward( # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = paddle.shape(hidden_states)[:2] + batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if cache is not None: assert len(cache) == 2, f"cache should have 2 past states: keys and values. 
Got { len(cache)} past states" - real_seq_length += paddle.shape(cache[0])[2] if query_length is None else query_length + real_seq_length += cache[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else paddle.shape(key_value_states)[1] + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): """projection""" @@ -361,7 +361,7 @@ def project(hidden_states, proj_layer, key_value_states, cache): # if key and values are already calculated # we want only the last query position bias if cache is not None: - position_bias = position_bias[:, :, -paddle.shape(hidden_states)[1] :, :] + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -514,7 +514,7 @@ def forward( # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: - query_length = paddle.shape(present_key_value_state[0])[2] + query_length = present_key_value_state[0].shape[2] else: query_length = None @@ -875,10 +875,10 @@ def forward( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") @@ -890,7 +890,7 @@ def forward( batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past - mask_seq_length = paddle.shape(cache[0][0])[2] + seq_length if cache is not None else seq_length + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" @@ -898,7 +898,7 @@ def forward( if attention_mask is None: attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = paddle.shape(encoder_hidden_states)[1] + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) # initialize caches with `None` if past does not exist @@ -912,7 +912,7 @@ def forward( # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = paddle.shape(encoder_hidden_states) + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) diff --git a/paddlenlp/transformers/ernie_ctm/modeling.py b/paddlenlp/transformers/ernie_ctm/modeling.py index b7db01b3f662..c3449ddc175f 100644 --- a/paddlenlp/transformers/ernie_ctm/modeling.py +++ b/paddlenlp/transformers/ernie_ctm/modeling.py @@ -100,7 
+100,7 @@ def __init__(self, config: ErnieCtmConfig): def forward(self, input_ids, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: - content_len = paddle.shape(input_ids)[1] - self.cls_num + content_len = input_ids.shape[1] - self.cls_num position_ids = paddle.concat( [ paddle.zeros(shape=[self.cls_num], dtype="int64"), diff --git a/paddlenlp/transformers/ernie_gen/modeling.py b/paddlenlp/transformers/ernie_gen/modeling.py index c0ac93636435..3a0a2f5fa3f4 100644 --- a/paddlenlp/transformers/ernie_gen/modeling.py +++ b/paddlenlp/transformers/ernie_gen/modeling.py @@ -493,7 +493,7 @@ def forward( assert ( attn_bias is not None if past_cache else True ), "if `past_cache` is specified; attn_bias should not be None" - d_seqlen = paddle.shape(src_ids)[1] + d_seqlen = src_ids.shape[1] if pos_ids is None: pos_ids = paddle.arange(0, d_seqlen, 1, dtype="int32").reshape([1, -1]).cast("int64") if attn_bias is None: diff --git a/paddlenlp/transformers/ernie_gram/modeling.py b/paddlenlp/transformers/ernie_gram/modeling.py index b4aef71dac04..438ee1b95c92 100644 --- a/paddlenlp/transformers/ernie_gram/modeling.py +++ b/paddlenlp/transformers/ernie_gram/modeling.py @@ -70,7 +70,7 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph @@ -556,7 +556,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/ernie_layout/modeling.py b/paddlenlp/transformers/ernie_layout/modeling.py index bef6be324175..fde6f36bce2a 100644 --- a/paddlenlp/transformers/ernie_layout/modeling.py +++ b/paddlenlp/transformers/ernie_layout/modeling.py @@ -230,7 +230,7 @@ def __init__(self, config): self.dropout = nn.Dropout(config["attention_probs_dropout_prob"]) def transpose_for_scores(self, x): - x = x.reshape([paddle.shape(x)[0], paddle.shape(x)[1], self.num_attention_heads, self.attention_head_size]) + x = x.reshape([x.shape[0], x.shape[1], self.num_attention_heads, self.attention_head_size]) return x.transpose([0, 2, 1, 3]) def compute_qkv(self, hidden_states): @@ -268,7 +268,7 @@ def forward( attention_scores += rel_2d_pos bool_attention_mask = attention_mask.astype(paddle.bool) bool_attention_mask.stop_gradient = True - attention_scores_shape = paddle.shape(attention_scores) + attention_scores_shape = attention_scores.shape attention_scores = paddle.where( bool_attention_mask.expand(attention_scores_shape), paddle.ones(attention_scores_shape) * float("-1e10"), @@ -280,9 +280,7 @@ def forward( attention_probs = self.dropout(attention_probs) context_layer = paddle.matmul(attention_probs, value_layer) context_layer = context_layer.transpose([0, 2, 1, 3]) - context_layer = context_layer.reshape( - [paddle.shape(context_layer)[0], paddle.shape(context_layer)[1], self.all_head_size] - ) + context_layer = context_layer.reshape([context_layer.shape[0], context_layer.shape[1], self.all_head_size]) if output_attentions: outputs = [context_layer, attention_probs] @@ -689,7 +687,7 @@ def _calc_visual_bbox(self, image_feature_pool_shape, 
bbox, visual_shape): visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), ], axis=-1, - ).reshape([expand_shape[0] * expand_shape[1], paddle.shape(bbox)[-1]]) + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) visual_bbox = visual_bbox.expand([visual_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) return visual_bbox @@ -737,7 +735,7 @@ def forward( output_hidden_states=False, output_attentions=False, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = self.config["image_feature_pool_shape"][0] * self.config["image_feature_pool_shape"][1] visual_bbox = self._calc_visual_bbox(self.config["image_feature_pool_shape"], bbox, visual_shape) @@ -844,7 +842,7 @@ def forward( head_mask=None, labels=None, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = ( self.ernie_layout.config["image_feature_pool_shape"][0] @@ -1040,7 +1038,7 @@ def forward( position_ids=position_ids, head_mask=head_mask, ) - seq_length = paddle.shape(input_ids)[1] + seq_length = input_ids.shape[1] sequence_output = outputs[0][:, :seq_length] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1117,7 +1115,7 @@ def forward( position_ids=position_ids, head_mask=head_mask, ) - seq_length = paddle.shape(input_ids)[1] + seq_length = input_ids.shape[1] sequence_output = outputs[0][:, :seq_length] sequence_output = self.dropout(sequence_output) @@ -1174,7 +1172,7 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_m bbox=bbox, image=image, ) - seq_length = paddle.shape(input_ids)[1] + seq_length = input_ids.shape[1] sequence_output = sequence_output[:, :seq_length] start_logits = self.linear_start(sequence_output) start_logits = paddle.squeeze(start_logits, -1) diff --git a/paddlenlp/transformers/ernie_m/modeling.py b/paddlenlp/transformers/ernie_m/modeling.py index 9b7e89de9284..aead16f86cc5 100644 --- a/paddlenlp/transformers/ernie_m/modeling.py +++ b/paddlenlp/transformers/ernie_m/modeling.py @@ -71,7 +71,7 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) if position_ids is None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] # maybe need use shape op to unify static graph and dynamic graph ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -556,7 +556,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/gau_alpha/modeling.py b/paddlenlp/transformers/gau_alpha/modeling.py index cb9d02ac755b..4a4ab981f0d2 100644 --- a/paddlenlp/transformers/gau_alpha/modeling.py +++ b/paddlenlp/transformers/gau_alpha/modeling.py @@ -58,7 +58,7 @@ def attention_normalize(a, mask=None, axis=-1, method="softmax"): if mask is not None: l = mask.sum(-1, keepdim=True) else: - l = paddle.ones_like(a) * paddle.shape(a)[-2] + l = paddle.ones_like(a) * a.shape[-2] if method == "squared_relu": return F.relu(a) ** 2 / l elif method == "softmax_plus": @@ -173,7 +173,7 @@ def initializer(tensor, num_hidden_layers=12, order=2, gain=1.0): """ 
https://github.com/bojone/bert4keras/blob/5572ed481a14f5a62be7107e3846c88a5d6b617d/bert4keras/models.py#L1226-L1235 """ - shape = paddle.shape(tensor) + shape = tensor.shape if shape[0] > 10000 or shape[0] < 10: hidden_size = shape[1] else: @@ -201,7 +201,7 @@ def __init__(self, config: GAUAlphaConfig): def forward(self, x, offset=0): # x shape [batch_size, seqlen, dim] - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], @@ -706,13 +706,13 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None): """ # input_ids: [bs, num_choice, seq_l] - input_ids = input_ids.reshape(shape=(-1, paddle.shape(input_ids)[-1])) # flat_input_ids: [bs*num_choice,seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] if token_type_ids is not None: - token_type_ids = token_type_ids.reshape(shape=(-1, paddle.shape(token_type_ids)[-1])) + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) if attention_mask is not None: - attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1])) + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 50cfc892d336..1bca9dcbfbc0 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -733,10 +733,10 @@ def __init__( def forward(self, input_ids, position_ids=None, inputs_embeddings=None): if input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape inputs_embeddings = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeddings)[:-1] + input_shape = inputs_embeddings.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") @@ -1167,10 +1167,10 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") # input_shape => bs, seq_len @@ -1182,7 +1182,7 @@ def forward( past_length = 0 if past_key_values[0] is not None: # bs, seq_len, num_head, head_dim - past_length = paddle.shape(past_key_values[0][0])[1] + past_length = past_key_values[0][0].shape[1] position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") position_ids = position_ids.unsqueeze(0) position_ids = paddle.expand(position_ids, input_shape) @@ -1193,7 +1193,7 @@ def forward( # TODO, use registered buffer length = input_shape[-1] if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] length = length + cache_length else: cache_length = 0 @@ -1800,16 +1800,14 @@ def forward( if input_ids is not None: sequence_lengths = (input_ids != eos_token_id).astype("int64").sum(axis=-1) - 1 else: - inputs_shape = paddle.shape(inputs_embeds)[:-1] + inputs_shape = inputs_embeds.shape[:-1] sequence_lengths = 
paddle.ones(inputs_shape[:-1], dtype="int64") * (inputs_shape[1] - 1) logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " "unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) - pooled_logits = logits.gather_nd( - paddle.stack([paddle.arange(paddle.shape(logits)[0]), sequence_lengths], axis=-1) - ) + pooled_logits = logits.gather_nd(paddle.stack([paddle.arange(logits.shape[0]), sequence_lengths], axis=-1)) loss = None diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py index 255763be395f..356fcb4b8442 100644 --- a/paddlenlp/transformers/gpt/modeling_auto.py +++ b/paddlenlp/transformers/gpt/modeling_auto.py @@ -626,14 +626,14 @@ def forward(self, input_ids, position_ids=None, inputs_embeddings=None): raise ValueError("You cannot specify both `inputs_embeddings` and `position_ids`)") # if input_ids is not None: - # input_shape = paddle.shape(input_ids) + # input_shape = input_ids.shape # inputs_embeddings = self.word_embeddings(input_ids) if input_ids is not None: input_shape = input_ids.shape inputs_embeddings = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeddings)[:-1] + input_shape = inputs_embeddings.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") @@ -1021,7 +1021,7 @@ def forward( input_shape = input_ids.shape input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") # input_shape => bs, seq_len @@ -1033,7 +1033,7 @@ def forward( past_length = 0 if past_key_values[0] is not None: # bs, seq_len, num_head, head_dim - past_length = paddle.shape(past_key_values[0][0])[1] + past_length = past_key_values[0][0].shape[1] position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") position_ids = position_ids.unsqueeze(0) position_ids = paddle.expand(position_ids, input_shape) @@ -1043,7 +1043,7 @@ def forward( # TODO, use registered buffer length = input_shape[-1] if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] length = length + cache_length else: cache_length = 0 diff --git a/paddlenlp/transformers/layoutlmv2/modeling.py b/paddlenlp/transformers/layoutlmv2/modeling.py index ce6df9f9a2f2..83212f9fe933 100644 --- a/paddlenlp/transformers/layoutlmv2/modeling.py +++ b/paddlenlp/transformers/layoutlmv2/modeling.py @@ -296,7 +296,7 @@ def forward( bool_attention_mask = attention_mask.astype(paddle.bool) bool_attention_mask.stop_gradient = True - attention_scores_shape = paddle.shape(attention_scores) + attention_scores_shape = attention_scores.shape attention_scores = paddle.where( bool_attention_mask.expand(attention_scores_shape), paddle.ones(attention_scores_shape) * float("-1e10"), @@ -711,7 +711,7 @@ def forward( output_hidden_states=False, output_attentions=False, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1] @@ -745,7 +745,7 @@ def forward( visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), ], axis=-1, - ).reshape([expand_shape[0] * expand_shape[1], paddle.shape(bbox)[-1]]) + ).reshape([expand_shape[0] * 
expand_shape[1], bbox.shape[-1]]) visual_bbox = visual_bbox.expand([input_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) final_bbox = paddle.concat([bbox, visual_bbox], axis=1) diff --git a/paddlenlp/transformers/layoutxlm/modeling.py b/paddlenlp/transformers/layoutxlm/modeling.py index a0f464416594..67a9881eec42 100644 --- a/paddlenlp/transformers/layoutxlm/modeling.py +++ b/paddlenlp/transformers/layoutxlm/modeling.py @@ -319,7 +319,7 @@ def forward( attention_scores += rel_2d_pos bool_attention_mask = attention_mask.astype(paddle.bool) bool_attention_mask.stop_gradient = True - attention_scores_shape = paddle.shape(attention_scores) + attention_scores_shape = attention_scores.shape attention_scores = paddle.where( bool_attention_mask.expand(attention_scores_shape), paddle.ones(attention_scores_shape) * float("-1e10"), @@ -699,7 +699,7 @@ def _calc_visual_bbox(self, image_feature_pool_shape, bbox, visual_shape): visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), ], axis=-1, - ).reshape([expand_shape[0] * expand_shape[1], paddle.shape(bbox)[-1]]) + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) visual_bbox = visual_bbox.expand([visual_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) return visual_bbox @@ -763,7 +763,7 @@ def forward( output_hidden_states=False, output_attentions=False, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1] visual_bbox = self._calc_visual_bbox(self.config.image_feature_pool_shape, bbox, visual_shape) @@ -963,7 +963,7 @@ def forward( head_mask=None, labels=None, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = ( self.layoutxlm.config.image_feature_pool_shape[0] * self.layoutxlm.config.image_feature_pool_shape[1] @@ -1146,7 +1146,7 @@ def __init__(self, hidden_size=768, hidden_dropout_prob=0.1): self.loss_fct = CrossEntropyLoss() def build_relation(self, relations, entities): - batch_size, max_seq_len = paddle.shape(entities)[:2] + batch_size, max_seq_len = entities.shape[:2] new_relations = paddle.full( shape=[batch_size, max_seq_len * max_seq_len, 3], fill_value=-1, dtype=relations.dtype ) @@ -1195,7 +1195,7 @@ def build_relation(self, relations, entities): relation_per_doc_label[: len(positive_relations)] = 1 relation_per_doc = paddle.concat([reordered_relations, relation_per_doc_label], axis=1) assert len(relation_per_doc[:, 0]) != 0 - new_relations[b, 0] = paddle.shape(relation_per_doc)[0].astype(new_relations.dtype) + new_relations[b, 0] = relation_per_doc.shape[0].astype(new_relations.dtype) new_relations[b, 1 : len(relation_per_doc) + 1] = relation_per_doc # new_relations.append(relation_per_doc) return new_relations, entities @@ -1219,7 +1219,7 @@ def get_predicted_relations(self, logits, relations, entities): return pred_relations def forward(self, hidden_states, entities, relations): - batch_size, max_length, _ = paddle.shape(entities) + batch_size, max_length, _ = entities.shape relations, entities = self.build_relation(relations, entities) loss = 0 all_pred_relations = paddle.full( @@ -1257,7 +1257,7 @@ def forward(self, hidden_states, entities, relations): pred_relations = self.get_predicted_relations(logits, relation, entities[b]) if len(pred_relations) > 0: pred_relations = paddle.stack(pred_relations) - all_pred_relations[b, 0, :, :] = 
paddle.shape(pred_relations)[0].astype(all_pred_relations.dtype) + all_pred_relations[b, 0, :, :] = pred_relations.shape[0].astype(all_pred_relations.dtype) all_pred_relations[b, 1 : len(pred_relations) + 1, :, :] = pred_relations return loss, all_pred_relations diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index d4da1b195a94..33479efb3a4b 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -1503,7 +1503,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) diff --git a/paddlenlp/transformers/llama/modeling_auto.py b/paddlenlp/transformers/llama/modeling_auto.py index 21635da46cca..e096dd3e70f6 100644 --- a/paddlenlp/transformers/llama/modeling_auto.py +++ b/paddlenlp/transformers/llama/modeling_auto.py @@ -934,7 +934,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: diff --git a/paddlenlp/transformers/llama/modeling_auto_static.py b/paddlenlp/transformers/llama/modeling_auto_static.py index 61bf3daa2529..d9af478b808c 100644 --- a/paddlenlp/transformers/llama/modeling_auto_static.py +++ b/paddlenlp/transformers/llama/modeling_auto_static.py @@ -870,7 +870,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index d401554fde3d..1f94cfd9e570 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -60,7 +60,7 @@ def shift_tokens_right(input_ids, pad_token_id): """ shifted_input_ids = input_ids.clone() input_flat = paddle.flatten(shifted_input_ids) - batch_size, seq_length = paddle.shape(shifted_input_ids) + batch_size, seq_length = shifted_input_ids.shape index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1 decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos) @@ -194,9 +194,9 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") @@ -312,10 +312,10 @@ def forward( if decoder_input_ids is not None and decoder_inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif decoder_input_ids is not None: - decoder_input_shape = paddle.shape(decoder_input_ids) + decoder_input_shape = decoder_input_ids.shape decoder_input_ids = decoder_input_ids.reshape((-1, 
decoder_input_shape[-1])) elif decoder_inputs_embeds is not None: - decoder_input_shape = paddle.shape(decoder_inputs_embeds)[:-1] + decoder_input_shape = decoder_inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -328,7 +328,7 @@ def forward( if decoder_inputs_embeds is None: decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 decoder_inputs_embed_pos = self.decoder_embed_positions(decoder_input_shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos @@ -730,7 +730,7 @@ def forward( return_dict=return_dict, ) output = outputs[0] - output_shape = paddle.shape(output) + output_shape = output.shape if input_ids is not None: eos_mask = paddle.cast(input_ids == self.mbart.config.eos_token_id, dtype="int64") if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: @@ -918,7 +918,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/minigpt4/modeling.py b/paddlenlp/transformers/minigpt4/modeling.py index df100125d432..c64d49b5ab25 100644 --- a/paddlenlp/transformers/minigpt4/modeling.py +++ b/paddlenlp/transformers/minigpt4/modeling.py @@ -203,7 +203,7 @@ def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - patch_embeds_shape = paddle.shape(patch_embeds) + patch_embeds_shape = patch_embeds.shape patch_embeds = paddle.reshape( patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] ).transpose([0, 2, 1]) diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py index 592f9a47847a..43db27261606 100644 --- a/paddlenlp/transformers/mixtral/modeling.py +++ b/paddlenlp/transformers/mixtral/modeling.py @@ -1183,7 +1183,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) diff --git a/paddlenlp/transformers/mobilebert/modeling.py b/paddlenlp/transformers/mobilebert/modeling.py index af7cb52ba5ae..f2c0a086172a 100644 --- a/paddlenlp/transformers/mobilebert/modeling.py +++ b/paddlenlp/transformers/mobilebert/modeling.py @@ -674,7 +674,7 @@ def forward( if labels is not None: loss_fct = paddle.nn.CrossEntropyLoss() total_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) ) if not return_dict: @@ -1173,7 +1173,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = 
paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/mt5/modeling.py b/paddlenlp/transformers/mt5/modeling.py index 2defa4717912..a07079746fc5 100644 --- a/paddlenlp/transformers/mt5/modeling.py +++ b/paddlenlp/transformers/mt5/modeling.py @@ -286,15 +286,15 @@ def forward( # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = paddle.shape(hidden_states)[:2] + batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if cache is not None: assert len(cache) == 2, f"cache should have 2 past states: keys and values. Got { len(cache)} past states" - real_seq_length += paddle.shape(cache[0])[2] if query_length is None else query_length + real_seq_length += cache[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else paddle.shape(key_value_states)[1] + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): """projection""" @@ -361,7 +361,7 @@ def project(hidden_states, proj_layer, key_value_states, cache): # if key and values are already calculated # we want only the last query position bias if cache is not None: - position_bias = position_bias[:, :, -paddle.shape(hidden_states)[1] :, :] + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -514,7 +514,7 @@ def forward( # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here if present_key_value_state is not None: - query_length = paddle.shape(present_key_value_state[0])[2] + query_length = present_key_value_state[0].shape[2] else: query_length = None @@ -875,10 +875,10 @@ def forward( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") @@ -890,7 +890,7 @@ def forward( batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past - mask_seq_length = paddle.shape(cache[0][0])[2] + seq_length if cache is not None else seq_length + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" @@ -898,7 +898,7 @@ def forward( if attention_mask is None: attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = paddle.shape(encoder_hidden_states)[1] + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) # initialize caches with `None` if past does not exist @@ -912,7 +912,7 @@ def forward( # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = paddle.shape(encoder_hidden_states) + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) diff --git a/paddlenlp/transformers/nezha/modeling.py b/paddlenlp/transformers/nezha/modeling.py index dcc3e98fd649..7e078f3d5748 100644 --- a/paddlenlp/transformers/nezha/modeling.py +++ b/paddlenlp/transformers/nezha/modeling.py @@ -234,7 +234,7 @@ def forward( if input_ids is not None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -823,7 +823,7 @@ def forward( if end_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/opt/modeling.py b/paddlenlp/transformers/opt/modeling.py index c9217f316415..41cc45482004 100644 --- a/paddlenlp/transformers/opt/modeling.py +++ b/paddlenlp/transformers/opt/modeling.py @@ -889,15 +889,15 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot 
specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") self.checkpoints = [] - past_key_values_length = paddle.shape(cache[0].k)[2] if cache is not None else 0 + past_key_values_length = cache[0].k.shape[2] if cache is not None else 0 seq_length_with_past = input_shape[-1] + past_key_values_length diff --git a/paddlenlp/transformers/pegasus/modeling.py b/paddlenlp/transformers/pegasus/modeling.py index 630b75549272..406f703e0c9b 100644 --- a/paddlenlp/transformers/pegasus/modeling.py +++ b/paddlenlp/transformers/pegasus/modeling.py @@ -183,7 +183,7 @@ def forward(self, input_ids: Optional[Tensor] = None, attention_mask: Optional[T if input_ids is None: raise ValueError("Input_ids cannot be None.") inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - inputs_embed_pos = self.encoder_embed_positions(paddle.shape(input_ids)) + inputs_embed_pos = self.encoder_embed_positions(input_ids.shape) hidden_states = inputs_embeds + inputs_embed_pos encoder_input = self.encoder_dropout(hidden_states) @@ -274,7 +274,7 @@ def forward( """ if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.tensor.triu( (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 ) @@ -286,10 +286,8 @@ def forward( decoder_input_ids ) * self.embed_scale * mix_ratio + self.embed_scale * x * (1 - mix_ratio) - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 - decoder_inputs_embed_pos = self.decoder_embed_positions( - paddle.shape(decoder_input_ids), past_key_values_length - ) + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.decoder_embed_positions(decoder_input_ids.shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos decoder_input = self.decoder_dropout(hidden_states) diff --git a/paddlenlp/transformers/reformer/modeling.py b/paddlenlp/transformers/reformer/modeling.py index 8d4f784cbac5..f94d220d5ee9 100644 --- a/paddlenlp/transformers/reformer/modeling.py +++ b/paddlenlp/transformers/reformer/modeling.py @@ -464,10 +464,10 @@ def forward( ): if input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape inputs_embeds = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") diff --git a/paddlenlp/transformers/roberta/modeling.py b/paddlenlp/transformers/roberta/modeling.py index 23c7dce1da36..e6f42c582996 100644 --- a/paddlenlp/transformers/roberta/modeling.py +++ b/paddlenlp/transformers/roberta/modeling.py @@ -101,7 +101,7 @@ def forward( position_ids.stop_gradient = True if token_type_ids is None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] token_type_ids = paddle.zeros(input_shape, dtype="int64") position_embeddings = self.position_embeddings(position_ids) @@ -119,7 +119,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): 
input_shape: paddle.Tensor Returns: paddle.Tensor """ - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] sequence_length = input_shape[1] position_ids = paddle.arange(self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype="int64") @@ -643,7 +643,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -1049,9 +1049,9 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: - num_choices = paddle.shape(input_ids)[1] + num_choices = input_ids.shape[1] elif inputs_embeds is not None: - num_choices = paddle.shape(inputs_embeds)[1] + num_choices = inputs_embeds.shape[1] input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None inputs_embeds = ( diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index d594171f9537..95c6cbbd493b 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -75,7 +75,7 @@ def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): inputs_embeds = self.word_embeddings(input_ids) if token_type_ids is None: - token_type_ids_shape = paddle.shape(inputs_embeds)[:-1] + token_type_ids_shape = inputs_embeds.shape[:-1] token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) @@ -97,7 +97,7 @@ def __init__(self, dim, max_position_embeddings=512): def forward(self, x, offset=0): # x shape [batch_size, num_heads, seqlen, head_dim] - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], @@ -683,7 +683,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/roformerv2/modeling.py b/paddlenlp/transformers/roformerv2/modeling.py index 105a013e46ca..e727109ab3a1 100644 --- a/paddlenlp/transformers/roformerv2/modeling.py +++ b/paddlenlp/transformers/roformerv2/modeling.py @@ -50,7 +50,7 @@ def initializer(tensor, num_hidden_layers=12, order=2, gain=1.0): """ https://github.com/bojone/bert4keras/blob/5572ed481a14f5a62be7107e3846c88a5d6b617d/bert4keras/models.py#L1226-L1235 """ - shape = paddle.shape(tensor) + shape = tensor.shape if shape[0] > 10000 or shape[0] < 10: hidden_size = shape[1] else: @@ -82,7 +82,7 @@ def __init__(self, dim, max_position_embeddings=512): def forward(self, x, offset=0): # x shape [batch_size, num_heads, seqlen, head_dim] - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], @@ -706,13 +706,13 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None): """ # input_ids: [bs, num_choice, seq_l] - input_ids = input_ids.reshape(shape=(-1, 
paddle.shape(input_ids)[-1])) # flat_input_ids: [bs*num_choice,seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] if token_type_ids is not None: - token_type_ids = token_type_ids.reshape(shape=(-1, paddle.shape(token_type_ids)[-1])) + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) if attention_mask is not None: - attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1])) + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) diff --git a/paddlenlp/transformers/skep/modeling.py b/paddlenlp/transformers/skep/modeling.py index 2a5c2b544760..ce4da4bff873 100644 --- a/paddlenlp/transformers/skep/modeling.py +++ b/paddlenlp/transformers/skep/modeling.py @@ -77,7 +77,7 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) if position_ids is None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] # maybe need use shape op to unify static graph and dynamic graph ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -92,7 +92,7 @@ def forward( embeddings = inputs_embeds + position_embeddings if self.type_vocab_size != 0: if token_type_ids is None: - token_type_ids_shape = paddle.shape(inputs_embeds)[:-1] + token_type_ids_shape = inputs_embeds.shape[:-1] token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings += token_type_embeddings @@ -330,7 +330,7 @@ def forward( axis=[1, 2], ) if past_key_values is not None: - batch_size = paddle.shape(past_key_values[0][0])[0] + batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) @@ -737,7 +737,7 @@ def forward( if attention_mask is not None: seq_lens = paddle.sum(attention_mask, axis=1, dtype="int64") else: - input_ids_shape = paddle.shape(input_ids) + input_ids_shape = input_ids.shape seq_lens = paddle.ones(shape=[input_ids_shape[0]], dtype="int64") * input_ids_shape[1] loss, prediction = None, None diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index bc27b4c91d57..596276a522f4 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -337,15 +337,15 @@ def forward( # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = paddle.shape(hidden_states)[:2] + batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if cache is not None: assert len(cache) == 2, f"cache should have 2 past states: keys and values. 
Got { len(cache)} past states" - real_seq_length += paddle.shape(cache[0])[2] if query_length is None else query_length + real_seq_length += cache[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else paddle.shape(key_value_states)[1] + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): """projection""" @@ -412,7 +412,7 @@ def project(hidden_states, proj_layer, key_value_states, cache): # if key and values are already calculated # we want only the last query position bias if cache is not None: - position_bias = position_bias[:, :, -paddle.shape(hidden_states)[1] :, :] + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -566,7 +566,7 @@ def forward( # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: - query_length = paddle.shape(present_key_value_state[0])[2] + query_length = present_key_value_state[0].shape[2] else: query_length = None @@ -990,10 +990,10 @@ def forward( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") @@ -1005,7 +1005,7 @@ def forward( batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past - mask_seq_length = paddle.shape(cache[0][0])[2] + seq_length if cache is not None else seq_length + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" @@ -1013,7 +1013,7 @@ def forward( if attention_mask is None: attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = paddle.shape(encoder_hidden_states)[1] + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) # initialize caches with `None` if past does not exist @@ -1027,7 +1027,7 @@ def forward( # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = paddle.shape(encoder_hidden_states) + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) diff --git a/paddlenlp/transformers/tinybert/modeling.py b/paddlenlp/transformers/tinybert/modeling.py index 25061d998513..b853740790af 100644 --- a/paddlenlp/transformers/tinybert/modeling.py +++ b/paddlenlp/transformers/tinybert/modeling.py @@ 
-617,7 +617,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/transformer/modeling.py b/paddlenlp/transformers/transformer/modeling.py index 1adb4508b386..8c61459d9659 100644 --- a/paddlenlp/transformers/transformer/modeling.py +++ b/paddlenlp/transformers/transformer/modeling.py @@ -298,7 +298,7 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T if target.dim() == lprobs.dim() - 1: target = target.unsqueeze(-1) - num_tokens = paddle.shape(lprobs)[0] + num_tokens = lprobs.shape[0] index = paddle.arange(0, num_tokens, dtype="int64").unsqueeze(-1) index = paddle.concat([index, target], axis=-1) index.stop_gradient = True @@ -498,7 +498,7 @@ def _merge_batch_beams_with_var_dim(self, c): return c def _split_batch_beams_with_var_dim(self, c): - var_dim_size = paddle.shape(c)[self.var_dim_in_state] + var_dim_size = c.shape[self.var_dim_in_state] c = paddle.reshape( c, [-1, self.beam_size] @@ -586,14 +586,14 @@ def step(self, time, inputs, states, **kwargs): if kwargs.get("trg_word", None) is not None: if paddle.in_dynamic_mode(): - if paddle.shape(kwargs.get("trg_word"))[1] > time: + if kwargs.get("trg_word").shape[1] > time: beam_search_output, beam_search_state = self.force_decoding( beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time ) else: def condition(trg_word, time): - return paddle.shape(trg_word)[1] > time + return trg_word.shape[1] > time def default_fn(beam_search_output, beam_search_state): return beam_search_output, beam_search_state @@ -624,8 +624,8 @@ def default_fn(beam_search_output, beam_search_state): return (beam_search_output, beam_search_state, next_inputs, finished) def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time): - batch_size = paddle.shape(beam_search_output.predicted_ids)[0] - beam_size = paddle.shape(beam_search_output.predicted_ids)[1] + batch_size = beam_search_output.predicted_ids.shape[0] + beam_size = beam_search_output.predicted_ids.shape[1] ids_dtype = beam_search_output.predicted_ids.dtype scores_dtype = beam_search_output.scores.dtype @@ -842,8 +842,8 @@ def forward(self, src_word, trg_word): src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]), trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len])) """ - src_max_len = paddle.shape(src_word)[-1] - trg_max_len = paddle.shape(trg_word)[-1] + src_max_len = src_word.shape[-1] + trg_max_len = trg_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -1050,7 +1050,7 @@ def forward(self, src_word, trg_word=None): trg_length = None if self.beam_search_version == "v1": - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -1124,7 +1124,7 @@ def merge_beam_dim(tensor): return paddle.reshape(tensor, [shape[0] * shape[1]] + list(shape[2:])) # run encoder - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == 
self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -1252,7 +1252,7 @@ def grow_topk(i, logits, alive_seq, alive_log_probs, states): topk_seq = gather_2d(alive_seq, topk_coordinates, beam_size, batch_size) topk_seq = paddle.concat([topk_seq, paddle.reshape(topk_ids, list(topk_ids.shape[:]) + [1])], axis=2) states = update_states(states, topk_coordinates, beam_size, batch_size) - eos = paddle.full(shape=paddle.shape(topk_ids), dtype=alive_seq.dtype, fill_value=self.eos_id) + eos = paddle.full(shape=topk_ids.shape, dtype=alive_seq.dtype, fill_value=self.eos_id) topk_finished = paddle.cast(paddle.equal(topk_ids, eos), "float32") # topk_seq: [batch_size, 2*beam_size, i+1] @@ -1320,7 +1320,7 @@ def force_decoding_v2(topk_ids, topk_scores, time): return topk_ids, topk_scores def inner_loop(i, pre_word, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, caches): - trg_pos = paddle.full(shape=paddle.shape(pre_word), dtype=alive_seq.dtype, fill_value=i) + trg_pos = paddle.full(shape=pre_word.shape, dtype=alive_seq.dtype, fill_value=i) trg_emb = self.trg_word_embedding(pre_word) trg_pos_emb = self.trg_pos_embedding(trg_pos) trg_emb = trg_emb + trg_pos_emb diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index 505baa0c86e4..fb85fc9c86e9 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -97,9 +97,9 @@ def forward( if input_ids is None and input_embeddings is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = paddle.shape(input_ids) + inputs_shape = input_ids.shape elif input_embeddings is not None: - inputs_shape = paddle.shape(input_embeddings)[:-1] + inputs_shape = input_embeddings.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if input_embeddings is None: @@ -524,17 +524,14 @@ def prepare_inputs_for_generation( if position_ids is None: if self.pad_token_id is None: - position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], dtype="int64"), input_ids - ) + position_ids = paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="int64"), input_ids) else: # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. # In that case, the position_ids must be provided. # And this is for left padding input_ids. 
num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( - paddle.expand_as(paddle.arange(end=paddle.shape(input_ids)[1], dtype="float32"), input_ids) - - num_pad + paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="float32"), input_ids) - num_pad ).astype("int64") position_ids.stop_gradient = True diff --git a/paddlenlp/transformers/unimo/modeling.py b/paddlenlp/transformers/unimo/modeling.py index fd633e5e4a67..fc5b0389d0db 100644 --- a/paddlenlp/transformers/unimo/modeling.py +++ b/paddlenlp/transformers/unimo/modeling.py @@ -89,9 +89,9 @@ def forward( if input_ids is None and input_embeddings is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = paddle.shape(input_ids) + inputs_shape = input_ids.shape elif input_embeddings is not None: - inputs_shape = paddle.shape(input_embeddings)[:-1] + inputs_shape = input_embeddings.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if input_embeddings is None: @@ -505,14 +505,11 @@ def prepare_inputs_for_generation( if position_ids is None: if self.pad_token_id is None: - position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], dtype="int64"), input_ids - ) + position_ids = paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="int64"), input_ids) else: num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( - paddle.expand_as(paddle.arange(end=paddle.shape(input_ids)[1], dtype="float32"), input_ids) - - num_pad + paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="float32"), input_ids) - num_pad ).astype("int64") position_ids.stop_gradient = True diff --git a/paddlenlp/transformers/xlm/modeling.py b/paddlenlp/transformers/xlm/modeling.py index 8e28666891c0..4f8fb0b8585f 100644 --- a/paddlenlp/transformers/xlm/modeling.py +++ b/paddlenlp/transformers/xlm/modeling.py @@ -49,7 +49,7 @@ def __init__(self, num_embeddings, embedding_dim): @staticmethod def _init_weight(out): - n_pos, dim = paddle.shape(out) + n_pos, dim = out.shape out.stop_gradient = True position_ids = paddle.arange(0, n_pos, dtype=out.dtype).unsqueeze(1) indices = paddle.arange(0, dim // 2, dtype=out.dtype).unsqueeze(0) @@ -75,7 +75,7 @@ def get_masks(seqlen, lengths, causal, padding_mask=None): mask = alen < lengths[:, None] # attention mask is the same as mask, or triangular inferior attention (causal) - bs = paddle.shape(lengths)[0] + bs = lengths.shape[0] if causal: attn_mask = paddle.tile(alen[None, None, :], (bs, seqlen, 1)) <= alen[None, :, None] else: @@ -115,11 +115,11 @@ def forward(self, input, mask, kv=None, cache=None, output_attentions=False): """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) - bs, qlen, dim = paddle.shape(input) + bs, qlen, dim = input.shape if kv is None: klen = qlen if cache is None else cache["seqlen"] + qlen else: - klen = paddle.shape(kv)[1] + klen = kv.shape[1] mask_reshape = (bs, 1, qlen, klen) if mask.ndim == 3 else (bs, 1, 1, klen) @@ -384,7 +384,7 @@ def forward( last_hidden_state = model(**inputs)[0] """ - bs, seqlen = paddle.shape(input_ids) + bs, seqlen = input_ids.shape if lengths is None: if input_ids is not None: @@ -448,7 +448,7 @@ def forward( # update cache length if cache is not None: - cache["seqlen"] += paddle.shape(tensor)[1] + cache["seqlen"] += tensor.shape[1] return tuple(v for v in 
[tensor, hidden_states, attentions] if v is not None)
@@ -864,18 +864,16 @@ def forward(self, input_ids=None, langs=None, attention_mask=None, position_ids=
         """
         num_choices = input_ids.shape[1]  # input_ids: [bs, num_choice, seqlen]

-        input_ids = input_ids.reshape(
-            shape=(-1, paddle.shape(input_ids)[-1])
-        )  # flat_input_ids: [bs*num_choice, seqlen]
+        input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1]))  # flat_input_ids: [bs*num_choice, seqlen]

         if langs is not None:
-            langs = langs.reshape(shape=(-1, paddle.shape(langs)[-1]))
+            langs = langs.reshape(shape=(-1, langs.shape[-1]))

         if attention_mask is not None:
-            attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1]))
+            attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1]))

         if position_ids is not None:
-            position_ids = position_ids.reshape(shape=(-1, paddle.shape(position_ids)[-1]))
+            position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1]))

         if lengths is not None:
             lengths = lengths.reshape(shape=(-1,))
diff --git a/paddlenlp/transformers/xlnet/modeling.py b/paddlenlp/transformers/xlnet/modeling.py
index ff1cb2eefda8..608f300db3fc 100644
--- a/paddlenlp/transformers/xlnet/modeling.py
+++ b/paddlenlp/transformers/xlnet/modeling.py
@@ -74,7 +74,7 @@ def prune_heads(self, heads):

     @staticmethod
     def rel_shift_bnij(x, klen=-1):
         # Relative shift of the attention matrix from bd~ to bd (refer to Appendix B in the Transformer-XL paper)
-        x_size = paddle.shape(x)
+        x_size = x.shape
         x = paddle.reshape(x, [x_size[0], x_size[1], x_size[3], x_size[2]])
         x = x[:, :, 1:, :]
@@ -104,7 +104,7 @@ def rel_attn_core(
         # q_head = Exi * Wq; self.r_r_bias = v; k_head_r = Wkr * Rij
         # b = Exi * Wq * Wkr * Rij; d = v * Wkr * Rij; bd = b + d
         bd = paddle.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r)
-        bd = self.rel_shift_bnij(bd, klen=paddle.shape(ac)[3])
+        bd = self.rel_shift_bnij(bd, klen=ac.shape[3])

         # Segment based attention score
         if seg_mat is None:
@@ -139,7 +139,7 @@ def post_attention(self, h, attn_vec, residual=True):
         """Post-attention processing."""
         # Post-attention projection (back to 'd_model')
         # Compute einsum4x4("ibnd,hnd->ibh", attn_vec, self.o)
-        shape = paddle.shape(attn_vec)
+        shape = attn_vec.shape
         attn_vec = attn_vec.reshape([shape[0], shape[1], attn_vec.shape[2] * attn_vec.shape[3]])
         attn_out = paddle.einsum("ibm,hm->ibh", attn_vec, self.o)
@@ -174,31 +174,23 @@ def forward(

             # Content-based key head
             # Compute k_head_h = einsum4x4("ibh,h(n*d)->ibnd", cat, self.k)
             k_head_h = paddle.matmul(cat, self.k)
-            k_head_h = paddle.reshape(
-                k_head_h, shape=[paddle.shape(cat)[0], paddle.shape(cat)[1], self.n_head, self.d_head]
-            )
+            k_head_h = paddle.reshape(k_head_h, shape=[cat.shape[0], cat.shape[1], self.n_head, self.d_head])

             # Content-based value head
             # Compute v_head_h = einsum4x4("ibh,h(n*d)->ibnd", cat, self.v)
             v_head_h = paddle.matmul(cat, self.v)
-            v_head_h = paddle.reshape(
-                v_head_h, shape=[paddle.shape(cat)[0], paddle.shape(cat)[1], self.n_head, self.d_head]
-            )
+            v_head_h = paddle.reshape(v_head_h, shape=[cat.shape[0], cat.shape[1], self.n_head, self.d_head])

             # Position-based key head
             # Compute k_head_r = einsum4x4("ibh,h(n*d)->ibnd", r, self.r)
             k_head_r = paddle.matmul(r, self.r)
-            k_head_r = paddle.reshape(
-                k_head_r, shape=[paddle.shape(r)[0], paddle.shape(r)[1], self.n_head, self.d_head]
-            )
+            k_head_r = paddle.reshape(k_head_r, shape=[r.shape[0], r.shape[1], self.n_head, self.d_head])

             # H-stream
             # Content-stream query head
             # Compute q_head_h = einsum4x4("ibh,h(n*d)->ibnd", h, self.q)
             q_head_h = paddle.matmul(h, self.q)  # shape
-            q_head_h = paddle.reshape(
-                q_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            q_head_h = paddle.reshape(q_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Core attention ops
             attn_vec_h = self.rel_attn_core(
@@ -276,26 +268,20 @@ def forward(

             # Content heads
             # Compute q_head_h = einsum4x4("ibh,hnd->ibnd", h, self.q)
             q_head_h = paddle.matmul(h, self.q)
-            q_head_h = paddle.reshape(
-                q_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            q_head_h = paddle.reshape(q_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Compute k_head_h = einsum4x4("ibh,hnd->ibnd", cat, self.k)
             k_head_h = paddle.matmul(cat, self.k)
-            k_head_h = paddle.reshape(
-                k_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            k_head_h = paddle.reshape(k_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Compute v_head_h = einsum4x4("ibh,hnd->ibnd", cat, self.v)
             v_head_h = paddle.matmul(cat, self.v)
-            v_head_h = paddle.reshape(
-                v_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            v_head_h = paddle.reshape(v_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Position-based key head
             # Compute k_head_r = einsum4x4("ibh,hnd->ibnd", r, self.r)
             k_head_r = paddle.matmul(r, self.r)
-            k_head_r = paddle.reshape(k_head_r, shape=[paddle.shape(k_head_r)[0], -1, self.n_head, self.d_head])
+            k_head_r = paddle.reshape(k_head_r, shape=[k_head_r.shape[0], -1, self.n_head, self.d_head])

             # Core attention ops
             attn_vec = self.rel_attn_core(
@@ -1003,10 +989,10 @@ def forward(
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
             input_ids = paddle.transpose(input_ids, perm=[1, 0])
-            qlen, bsz = paddle.shape(input_ids)[0], paddle.shape(input_ids)[1]
+            qlen, bsz = input_ids.shape[0], input_ids.shape[1]
         elif inputs_embeds is not None:
             inputs_embeds = paddle.transpose(inputs_embeds, perm=[1, 0])
-            qlen, bsz = paddle.shape(inputs_embeds)[0], paddle.shape(inputs_embeds)[1]
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
@@ -1016,7 +1002,7 @@ def forward(
         perm_mask = perm_mask.transpose([1, 2, 0]) if perm_mask is not None else None
         target_mapping = target_mapping.transpose([1, 2, 0]) if target_mapping is not None else None

-        mlen = paddle.shape(mems[0])[0] if mems is not None and mems[0] is not None else 0
+        mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
         klen = mlen + qlen

         # Attention mask
@@ -1046,7 +1032,7 @@ def forward(
         if data_mask is not None:
             # All mems can be attended to
             if mlen > 0:
-                mems_mask = paddle.cast(paddle.zeros([paddle.shape(data_mask)[0], mlen, bsz]), dtype=dtype_float)
+                mems_mask = paddle.cast(paddle.zeros([data_mask.shape[0], mlen, bsz]), dtype=dtype_float)
                 data_mask = paddle.concat([mems_mask, data_mask], axis=1)
             if attn_mask is None:
                 attn_mask = paddle.unsqueeze(data_mask, axis=-1)
@@ -1077,7 +1063,7 @@ def forward(
         output_h = self.dropout(word_emb_k)

         if target_mapping is not None:
-            word_emb_q = self.mask_emb.expand([paddle.shape(target_mapping)[0], bsz, -1])
+            word_emb_q = self.mask_emb.expand([target_mapping.shape[0], bsz, -1])
             output_g = self.dropout(word_emb_q)
         else:
             output_g = None
@@ -1743,19 +1729,17 @@ def forward(
             print(reshaped_logits.shape)  # [2, 2]
         """
-        num_choices = paddle.shape(input_ids)[1] if input_ids is not None else paddle.shape(inputs_embeds)[1]
-        input_ids = input_ids.reshape(shape=(-1, paddle.shape(input_ids)[-1]))  # flat_input_ids: [bs*num_choice,seq_l]
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1]))  # flat_input_ids: [bs*num_choice,seq_l]

         if attention_mask is not None:
-            attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1]))
+            attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1]))

         if token_type_ids is not None:
-            token_type_ids = token_type_ids.reshape(shape=(-1, paddle.shape(token_type_ids)[-1]))
+            token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1]))

         if inputs_embeds is not None:
-            inputs_embeds = inputs_embeds.reshape(
-                shape=(paddle.shape(inputs_embeds)[0], -1, paddle.shape(inputs_embeds)[-1])
-            )
+            inputs_embeds = inputs_embeds.reshape(shape=(inputs_embeds.shape[0], -1, inputs_embeds.shape[-1]))

         transformer_outputs = self.transformer(
             input_ids,
@@ -1920,7 +1904,7 @@ def forward(
             if start_positions.ndim > 1:
                 end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = paddle.shape(start_logits)[1]
+            ignored_index = start_logits.shape[1]
             start_positions = start_positions.clip(0, ignored_index)
             end_positions = end_positions.clip(0, ignored_index)
diff --git a/tests/test_tipc/benchmark/modules/bert_for_question_answering.py b/tests/test_tipc/benchmark/modules/bert_for_question_answering.py
index d99a81f8898d..1005a0dccea2 100644
--- a/tests/test_tipc/benchmark/modules/bert_for_question_answering.py
+++ b/tests/test_tipc/benchmark/modules/bert_for_question_answering.py
@@ -75,7 +75,7 @@ def forward(self, model, args, input_data=None, **kwargs):
         if start_positions.ndim > 1:
             end_positions = end_positions.squeeze(-1)
         # sometimes the start/end positions are outside our model inputs, we ignore these terms
-        ignored_index = paddle.shape(start_logits)[1]
+        ignored_index = start_logits.shape[1]
         start_positions = start_positions.clip(0, ignored_index)
         end_positions = end_positions.clip(0, ignored_index)
diff --git a/tests/test_tipc/transformer/modeling.py b/tests/test_tipc/transformer/modeling.py
index 7ecb8849f8fc..c263b5a8b7f7 100644
--- a/tests/test_tipc/transformer/modeling.py
+++ b/tests/test_tipc/transformer/modeling.py
@@ -446,7 +446,7 @@ def _merge_batch_beams_with_var_dim(self, c):
         return c

     def _split_batch_beams_with_var_dim(self, c):
-        var_dim_size = paddle.shape(c)[self.var_dim_in_state]
+        var_dim_size = c.shape[self.var_dim_in_state]
         c = paddle.reshape(
             c,
             [-1, self.beam_size]
@@ -509,14 +509,14 @@ def step(self, time, inputs, states, **kwargs):

         if kwargs.get("trg_word", None) is not None:
             if paddle.in_dynamic_mode():
-                if paddle.shape(kwargs.get("trg_word"))[1] > time:
+                if kwargs.get("trg_word").shape[1] > time:
                     beam_search_output, beam_search_state = self.force_decoding(
                         beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time
                     )
             else:

                 def condition(trg_word, time):
-                    return paddle.shape(trg_word)[1] > time
+                    return trg_word.shape[1] > time

                 def default_fn(beam_search_output, beam_search_state):
                     return beam_search_output, beam_search_state
@@ -547,8 +547,8 @@ def default_fn(beam_search_output, beam_search_state):
         return (beam_search_output, beam_search_state, next_inputs, finished)

     def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time):
-        batch_size = paddle.shape(beam_search_output.predicted_ids)[0]
-        beam_size = paddle.shape(beam_search_output.predicted_ids)[1]
+        batch_size = beam_search_output.predicted_ids.shape[0]
+        beam_size = beam_search_output.predicted_ids.shape[1]

         ids_dtype = beam_search_output.predicted_ids.dtype
         scores_dtype = beam_search_output.scores.dtype
@@ -735,8 +735,8 @@ def forward(self, src_word, trg_word):
                 src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]),
                 trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
         """
-        src_max_len = paddle.shape(src_word)[-1]
-        trg_max_len = paddle.shape(trg_word)[-1]
+        src_max_len = src_word.shape[-1]
+        trg_max_len = trg_word.shape[-1]
         src_slf_attn_bias = (
             paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
         )
diff --git a/tests/transformer/modeling.py b/tests/transformer/modeling.py
index 7ecb8849f8fc..c263b5a8b7f7 100644
--- a/tests/transformer/modeling.py
+++ b/tests/transformer/modeling.py
@@ -446,7 +446,7 @@ def _merge_batch_beams_with_var_dim(self, c):
         return c

     def _split_batch_beams_with_var_dim(self, c):
-        var_dim_size = paddle.shape(c)[self.var_dim_in_state]
+        var_dim_size = c.shape[self.var_dim_in_state]
         c = paddle.reshape(
             c,
             [-1, self.beam_size]
@@ -509,14 +509,14 @@ def step(self, time, inputs, states, **kwargs):

         if kwargs.get("trg_word", None) is not None:
             if paddle.in_dynamic_mode():
-                if paddle.shape(kwargs.get("trg_word"))[1] > time:
+                if kwargs.get("trg_word").shape[1] > time:
                     beam_search_output, beam_search_state = self.force_decoding(
                         beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time
                     )
             else:

                 def condition(trg_word, time):
-                    return paddle.shape(trg_word)[1] > time
+                    return trg_word.shape[1] > time

                 def default_fn(beam_search_output, beam_search_state):
                     return beam_search_output, beam_search_state
@@ -547,8 +547,8 @@ def default_fn(beam_search_output, beam_search_state):
         return (beam_search_output, beam_search_state, next_inputs, finished)

     def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time):
-        batch_size = paddle.shape(beam_search_output.predicted_ids)[0]
-        beam_size = paddle.shape(beam_search_output.predicted_ids)[1]
+        batch_size = beam_search_output.predicted_ids.shape[0]
+        beam_size = beam_search_output.predicted_ids.shape[1]

         ids_dtype = beam_search_output.predicted_ids.dtype
         scores_dtype = beam_search_output.scores.dtype
@@ -735,8 +735,8 @@ def forward(self, src_word, trg_word):
                 src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]),
                 trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
         """
-        src_max_len = paddle.shape(src_word)[-1]
-        trg_max_len = paddle.shape(trg_word)[-1]
+        src_max_len = src_word.shape[-1]
+        trg_max_len = trg_word.shape[-1]
         src_slf_attn_bias = (
             paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
         )

From 5ce51a3dc769be1ea409c51ce5d80e0cc7c6ccd2 Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Mon, 15 Apr 2024 08:00:37 +0000
Subject: [PATCH 2/3] refine

---
 paddlenlp/experimental/transformers/llama/modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlenlp/experimental/transformers/llama/modeling.py b/paddlenlp/experimental/transformers/llama/modeling.py
index a67731916a92..f22eecb15d19 100644
--- a/paddlenlp/experimental/transformers/llama/modeling.py
+++ b/paddlenlp/experimental/transformers/llama/modeling.py
@@ -451,7 +451,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=new_rope,
             rotary_emb_dims=1,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.norm(hidden_states)

From 4b044ef09a551edcd98563ee041297dcc4f255fc Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Mon, 15 Apr 2024 08:48:17 +0000
Subject: [PATCH 3/3] refine

---
 paddlenlp/experimental/transformers/bloom/modeling.py      | 2 +-
 paddlenlp/experimental/transformers/chatglm_v2/modeling.py | 2 +-
 paddlenlp/experimental/transformers/gpt/modeling.py        | 2 +-
 paddlenlp/experimental/transformers/opt/modeling.py        | 2 +-
 paddlenlp/experimental/transformers/qwen/modeling.py       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddlenlp/experimental/transformers/bloom/modeling.py b/paddlenlp/experimental/transformers/bloom/modeling.py
index fbb983622fef..659826fe6f1b 100644
--- a/paddlenlp/experimental/transformers/bloom/modeling.py
+++ b/paddlenlp/experimental/transformers/bloom/modeling.py
@@ -279,7 +279,7 @@ def forward(
             pre_caches=pre_caches,
             pre_caches_length=position_offset,
             seq_lens=seq_len,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         # Add last hidden state
diff --git a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
index 712f03dde2fe..75dd08396398 100644
--- a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
+++ b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
@@ -285,7 +285,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=paddle.cast(rotary_pos_emb, "float32"),
             rotary_emb_dims=1,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.final_layernorm(hidden_states)
diff --git a/paddlenlp/experimental/transformers/gpt/modeling.py b/paddlenlp/experimental/transformers/gpt/modeling.py
index b987c7a70974..6627c9e42abb 100644
--- a/paddlenlp/experimental/transformers/gpt/modeling.py
+++ b/paddlenlp/experimental/transformers/gpt/modeling.py
@@ -265,7 +265,7 @@ def forward(
             attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype),
             caches=cache_kvs,
             seq_lens=seq_lens,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.norm(hidden_states)
diff --git a/paddlenlp/experimental/transformers/opt/modeling.py b/paddlenlp/experimental/transformers/opt/modeling.py
index 2f8228a75947..afcb1331b52c 100644
--- a/paddlenlp/experimental/transformers/opt/modeling.py
+++ b/paddlenlp/experimental/transformers/opt/modeling.py
@@ -247,7 +247,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=None,
             rotary_emb_dims=0,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         output = hidden_states
diff --git a/paddlenlp/experimental/transformers/qwen/modeling.py b/paddlenlp/experimental/transformers/qwen/modeling.py
index 975bf22abf15..fc6bb92a627d 100644
--- a/paddlenlp/experimental/transformers/qwen/modeling.py
+++ b/paddlenlp/experimental/transformers/qwen/modeling.py
@@ -340,7 +340,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=new_rope,
             rotary_emb_dims=1,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.ln_f(hidden_states)
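
Note (illustration only, not part of the patch series): the minimal sketch below contrasts the two shape-access patterns the commits above switch between; the tensor names are invented for the example. In dynamic mode, `Tensor.shape` is a plain Python list, so `tensor.shape[i]` yields an `int` that works directly in `reshape`, `arange`, comparisons, and similar call sites, which is the direction PATCH 1/3 takes. `paddle.shape(tensor)` instead returns an int32 Tensor, which is still required wherever the value feeds a Tensor-only API such as `paddle.increment`; that is why the "refine" commits keep `paddle.shape(attention_mask)[-1]` for the `time_step` argument.

# Minimal sketch, assuming a dynamic-mode Paddle install; tensor names are invented.
import paddle

x = paddle.ones([2, 8, 64])
attention_mask = paddle.ones([2, 1, 1, 16])

# Tensor.shape is a plain Python list in dynamic mode, so indexing it gives an int
# that can be used directly in reshape/arange/comparisons (the PATCH 1/3 direction).
seq_len = x.shape[-1]            # 64, a Python int
flat = x.reshape([-1, seq_len])

# paddle.shape(tensor) returns an int32 Tensor, which Tensor-only APIs still need,
# e.g. paddle.increment backing the time_step argument (the PATCH 2/3 and 3/3 direction).
time_step = paddle.increment(paddle.shape(attention_mask)[-1], -1)  # Tensor holding 15
# Passing attention_mask.shape[-1] (a Python int) to paddle.increment would not work.

In general the Tensor-returning form also stays valid under static-graph capture, where list-based shapes may contain -1 for dynamic dimensions, so it remains the safer choice for values consumed by Tensor-only ops.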