Commit 474aaaa

fix llama3 eot. (#8371)
1 parent d9f555e commit 474aaaa

File tree

1 file changed: +5 -2 lines changed

paddlenlp/transformers/llama/tokenizer.py

Lines changed: 5 additions & 2 deletions
@@ -295,11 +295,12 @@ def _pad(
 ENDOFTEXT = "<|end_of_text|>"
 IMSTART = "<|start_header_id|>"
 IMEND = "<|end_header_id|>"
+EOTID = "<|eot_id|>"
 # as the default behavior is changed to allow special tokens in
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
-EXTRAS = tuple(f"<|reserved_special_token_{i}|>" for i in range(250))
-SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:]
+EXTRAS = tuple(f"<|reserved_special_token_{i}|>" for i in range(251))
+SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]

 tiktoken = None

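Note: the tokenizer derives each special token's id from its position in SPECIAL_TOKENS, enumerating in order starting right after the base BPE vocabulary. Inserting EXTRAS[4] and EOTID between the header markers, and growing EXTRAS from 250 to 251 reserved tokens, keeps every id aligned with the official Llama 3 layout, where <|eot_id|> is token 128009 and the special ids span 128000..128255. A minimal standalone sketch of that mapping, assuming the Llama 3 base vocabulary size of 128000 (illustrative only, not code from this commit):

# Rebuild the id mapping implied by the SPECIAL_TOKENS ordering,
# assuming special ids start right after the 128000 base BPE tokens.
BEGINOFTEXT = "<|begin_of_text|>"
ENDOFTEXT = "<|end_of_text|>"
IMSTART = "<|start_header_id|>"
IMEND = "<|end_header_id|>"
EOTID = "<|eot_id|>"
EXTRAS = tuple(f"<|reserved_special_token_{i}|>" for i in range(251))
SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]

special_tokens = {token: 128000 + i for i, token in enumerate(SPECIAL_TOKENS)}
assert special_tokens[IMSTART] == 128006
assert special_tokens[IMEND] == 128007
assert special_tokens[EOTID] == 128009  # the official Llama 3 <|eot_id|> id
assert len(SPECIAL_TOKENS) == 256       # specials fill 128000..128255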
@@ -354,9 +355,11 @@ def __init__(

         self.tokenizer = enc  # type: tiktoken.Encoding

+        self.bod_id = self.special_tokens[BEGINOFTEXT]
         self.eod_id = self.special_tokens[ENDOFTEXT]
         self.start_header_id = self.special_tokens[IMSTART]
         self.end_header_id = self.special_tokens[IMEND]
+        self.eot_id = self.special_tokens[EOTID]

         if "pad_token_id" in kwargs:
             self.pad_token_id = kwargs["pad_token_id"]
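With eot_id exposed alongside eod_id, a decode loop can stop on either the end-of-text or the end-of-turn token. A hypothetical usage sketch; the helper below is illustrative and not part of this commit:

# Truncate a generated id sequence at the first stop token.
def trim_at_stop(token_ids, eod_id, eot_id):
    stop_ids = {eod_id, eot_id}
    out = []
    for tid in token_ids:
        if tid in stop_ids:
            break
        out.append(tid)
    return out

# With the Llama 3 ids, eod_id=128001 and eot_id=128009:
print(trim_at_stop([791, 24748, 128009, 374], eod_id=128001, eot_id=128009))
# -> [791, 24748]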
