
Commit 5bdf751

[LLM] Fix Qwen2 (#8584)
* fix output_router_logits
* fix with __future__
1 parent 4609d07 commit 5bdf751

File tree

1 file changed: +6 -13 lines changed


paddlenlp/transformers/qwen2/modeling.py

Lines changed: 6 additions & 13 deletions
@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Paddle Qwen2 model."""
+from __future__ import annotations
 
 import math
 import warnings
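
Note: the hunk above only adds the import; the annotation that required it is not shown in this diff. As background, from __future__ import annotations (PEP 563) stores every type hint in the module as a string instead of evaluating it at definition time, so hints may name things that are unavailable at runtime. A minimal standalone sketch of that behavior (the function and names below are illustrative, not taken from this file):

# Sketch only: demonstrates postponed evaluation of annotations (PEP 563);
# not code from this commit.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # visible to type checkers only; never imported at runtime
    import paddle

def last_token(hidden_states: paddle.Tensor) -> paddle.Tensor:
    # the paddle.Tensor hints are kept as strings, so this definition runs
    # even though paddle was not imported above
    return hidden_states[:, -1]

print(last_token.__annotations__)  # {'hidden_states': 'paddle.Tensor', 'return': 'paddle.Tensor'}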
@@ -187,11 +188,11 @@ def scaled_dot_product_attention(
     else:
         # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
         query_states = paddle.transpose(query_states, [0, 2, 1, 3])
-        # merge with the next tranpose
+        # merge with the next transpose
         key_states = paddle.transpose(key_states, [0, 2, 1, 3])
         value_states = paddle.transpose(value_states, [0, 2, 1, 3])
 
-        # matmul and devide by sqrt(head_dim)
+        # matmul and divide by sqrt(head_dim)
         attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
 
         if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]:
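
For context, the two comment fixes above sit on the non-fused attention path of scaled_dot_product_attention. A small self-contained sketch of that transpose-then-scaled-matmul step, with made-up shapes (not code from the commit):

# Sketch only: reproduces the transpose + scaled QK^T step touched above
# using small made-up shapes.
import math

import paddle

bsz, q_len, num_heads, head_dim = 2, 8, 4, 16
query_states = paddle.randn([bsz, q_len, num_heads, head_dim])
key_states = paddle.randn([bsz, q_len, num_heads, head_dim])

# [bsz, seqlen, nhead, head_dim] -> [bsz, nhead, seqlen, head_dim]
query_states = paddle.transpose(query_states, [0, 2, 1, 3])
key_states = paddle.transpose(key_states, [0, 2, 1, 3])

# matmul and divide by sqrt(head_dim)
attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
print(attn_weights.shape)  # [2, 4, 8, 8]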
@@ -1127,7 +1128,7 @@ def forward(self, prediction_scores, masked_lm_labels):
         if self.enable_parallel_cross_entropy:
             if prediction_scores.shape[-1] == self.config.vocab_size:
                 warnings.warn(
-                    f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+                    f"enable_parallel_cross_entropy, the vocab_size should be splitted: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
                 )
                 self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index)
 
@@ -1202,14 +1203,7 @@ def get_decoder(self):
         return self.qwen2
 
     def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        use_cache=False,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        output_router_logits=False,
-        **kwargs
+        self, input_ids, use_cache=False, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         batch_size, seq_length = input_ids.shape
         position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length)))
@@ -1230,7 +1224,6 @@ def prepare_inputs_for_generation(
                 "past_key_values": past_key_values,
                 "use_cache": use_cache,
                 "attention_mask": attention_mask,
-                "output_router_logits": output_router_logits,
             }
         )
         return model_inputs
@@ -1325,7 +1318,7 @@ def forward(
         hidden_states = outputs[0]
 
         # if labels is None,means we need full output, instead of tensor_parallel_output
-        # tensor_parallel_output is togather with ParallelCrossEntropy
+        # tensor_parallel_output is together with ParallelCrossEntropy
         tensor_parallel_output = (
             self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
         )
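
Net effect of the prepare_inputs_for_generation changes: output_router_logits is a mixture-of-experts flag, and it is no longer threaded through the generation inputs, presumably because the dense Qwen2 forward() does not take it. A rough usage sketch of the intended behavior follows; the checkpoint name, and the assumption that Qwen2ForCausalLM / Qwen2Tokenizer are importable this way, are illustrative guesses rather than something this commit shows:

# Sketch only: checks the intended effect of this fix; names below are assumptions.
from paddlenlp.transformers import Qwen2ForCausalLM, Qwen2Tokenizer

model_name = "Qwen/Qwen2-0.5B"  # illustrative checkpoint name
tokenizer = Qwen2Tokenizer.from_pretrained(model_name)
model = Qwen2ForCausalLM.from_pretrained(model_name)

input_ids = tokenizer("Hello", return_tensors="pd")["input_ids"]

# After this commit the prepared inputs carry no MoE-only key, so the dense
# forward() signature is respected during generation.
model_inputs = model.prepare_inputs_for_generation(input_ids, use_cache=True)
assert "output_router_logits" not in model_inputs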

0 commit comments