@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Paddle Qwen2 model."""
+from __future__ import annotations
 
 import math
 import warnings
@@ -187,11 +188,11 @@ def scaled_dot_product_attention(
     else:
         # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
         query_states = paddle.transpose(query_states, [0, 2, 1, 3])
-        # merge with the next tranpose
+        # merge with the next transpose
         key_states = paddle.transpose(key_states, [0, 2, 1, 3])
         value_states = paddle.transpose(value_states, [0, 2, 1, 3])
 
-        # matmul and devide by sqrt(head_dim)
+        # matmul and divide by sqrt(head_dim)
        attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
 
         if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]:
@@ -1127,7 +1128,7 @@ def forward(self, prediction_scores, masked_lm_labels):
         if self.enable_parallel_cross_entropy:
             if prediction_scores.shape[-1] == self.config.vocab_size:
                 warnings.warn(
-                    f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+                    f"enable_parallel_cross_entropy, the vocab_size should be split: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
                 )
                 self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index)
 
@@ -1202,14 +1203,7 @@ def get_decoder(self):
         return self.qwen2
 
     def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        use_cache=False,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        output_router_logits=False,
-        **kwargs
+        self, input_ids, use_cache=False, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         batch_size, seq_length = input_ids.shape
         position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length)))
@@ -1230,7 +1224,6 @@ def prepare_inputs_for_generation(
                 "past_key_values": past_key_values,
                 "use_cache": use_cache,
                 "attention_mask": attention_mask,
-                "output_router_logits": output_router_logits,
             }
         )
         return model_inputs
@@ -1325,7 +1318,7 @@ def forward(
         hidden_states = outputs[0]
 
         # if labels is None,means we need full output, instead of tensor_parallel_output
-        # tensor_parallel_output is togather with ParallelCrossEntropy
+        # tensor_parallel_output is together with ParallelCrossEntropy
         tensor_parallel_output = (
            self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
        )
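
For context on the `scaled_dot_product_attention` hunk above: outside the flash-attention path, the computation is a transpose to `[bs, nhead, seq_len, head_dim]` followed by a matmul scaled by `1/sqrt(head_dim)`. Below is a minimal, self-contained sketch of that math in Paddle. The tensor shapes and random inputs are illustrative assumptions (not values from the model config), and the closing softmax/value step is the standard completion of scaled dot-product attention rather than code copied from this file.

```python
import math

import paddle

# Illustrative sizes only (assumptions for this sketch).
bsz, q_len, num_heads, head_dim = 2, 8, 4, 16
kv_seq_len = q_len

# Random stand-ins for the projected query/key/value tensors: [bs, seq_len, nhead, head_dim].
query_states = paddle.randn([bsz, q_len, num_heads, head_dim])
key_states = paddle.randn([bsz, kv_seq_len, num_heads, head_dim])
value_states = paddle.randn([bsz, kv_seq_len, num_heads, head_dim])

# [bs, seq_len, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
query_states = paddle.transpose(query_states, [0, 2, 1, 3])
key_states = paddle.transpose(key_states, [0, 2, 1, 3])
value_states = paddle.transpose(value_states, [0, 2, 1, 3])

# matmul and divide by sqrt(head_dim) -> [bs, nhead, q_len, kv_seq_len]
attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
assert attn_weights.shape == [bsz, num_heads, q_len, kv_seq_len]

# Standard completion: softmax over the key dimension, then weight the values.
attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1)
attn_output = paddle.matmul(attn_weights, value_states)  # [bs, nhead, q_len, head_dim]
```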