Commit c5e6db5

[Tokenizer] Enable padding_side as call-time kwargs (#9161)
1 parent cd4e816 commit c5e6db5

File tree: 15 files changed (+186, -57 lines)
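The change is easiest to see from the caller's side: `padding_side` can now be passed per call instead of only being set through the tokenizer-level attribute. A minimal usage sketch; the checkpoint name and inputs are illustrative and not part of this commit:

```python
# Minimal usage sketch of the new call-time kwarg (checkpoint name and inputs
# are illustrative; before this commit the side could only be changed by
# mutating tokenizer.padding_side).
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("some-org/some-causal-lm")  # hypothetical checkpoint

batch = tokenizer(
    ["Hello", "A longer sentence that forces the first one to be padded"],
    padding=True,          # pad to the longest sequence in the batch
    padding_side="left",   # new: per-call override of tokenizer.padding_side
    return_tensors="pd",   # return Paddle tensors
)
# tokenizer.padding_side itself is unchanged; only this call pads on the left.
```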

paddlenlp/transformers/artist/tokenizer.py (2 additions, 0 deletions)

@@ -225,6 +225,7 @@ def __call__(
         return_offsets_mapping=False,
         add_special_tokens=True,
         pad_to_multiple_of=None,
+        padding_side=None,
         return_tensors=None,
         verbose: bool = True,
         **kwargs
@@ -247,6 +248,7 @@ def __call__(
             return_offsets_mapping,
             add_special_tokens,
             pad_to_multiple_of,
+            padding_side,
             return_tensors,
             verbose,
             **kwargs,

paddlenlp/transformers/bloom/tokenizer.py (7 additions, 3 deletions)

@@ -18,7 +18,7 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Optional, Union
+from typing import Dict, Literal, Optional, Union
 
 import numpy as np
 from paddle.utils import try_import
@@ -360,6 +360,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -375,13 +376,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -394,7 +398,7 @@ def _pad(
 
         required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
         )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask

paddlenlp/transformers/chatglm/tokenizer.py (4 additions, 2 deletions)

@@ -14,7 +14,7 @@
 
 """Tokenization classes for ChatGLM."""
 import os
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Literal, Optional, Union
 
 import numpy as np
 import sentencepiece as spm
@@ -218,13 +218,15 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy=PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         # Load from model defaults
         if return_attention_mask is None:
             return_attention_mask = "attention_mask" in self.model_input_names or "attention_mask" in encoded_inputs
 
-        assert self.padding_side == "left"
+        padding_side = padding_side if padding_side is not None else self.padding_side
+        assert padding_side == "left"
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
 
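Unlike the tokenizers that delegate to `super()._pad`, the ChatGLM tokenizers (this one and the v2 variant below) build their padding themselves, so they resolve the call-time value against `self.padding_side` locally and still assert that the result is "left". Left padding is the usual requirement for decoder-only generation: every row of the batch must end on a real token. An illustrative view (values invented for the example, not taken from the commit) of what a left-padded batch looks like:

```python
# Illustrative left-padded batch: shorter prompts are padded at the front and
# masked out, so generation can append new tokens directly after the last real
# token of every row.
pad_id = 3
batch_input_ids = [
    [pad_id, pad_id, 101, 102, 103],  # short prompt, left-padded
    [201, 202, 203, 204, 205],        # full-length prompt, no padding needed
]
attention_mask = [
    [0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1],
]
```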

paddlenlp/transformers/chatglm_v2/tokenizer.py (9 additions, 4 deletions)

@@ -15,7 +15,7 @@
 
 import os
 import re
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -244,6 +244,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -259,18 +260,22 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
+                >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
         # Load from model defaults
-        assert self.padding_side == "left"
+        padding_side = padding_side if padding_side is not None else self.padding_side
+        assert padding_side == "left"
 
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
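Across these files the only import change is pulling in `Literal`, which the new annotation `Optional[Literal["right", "left"]]` uses to restrict the accepted values at type-check time. A small standalone illustration (the helper function is hypothetical, not part of the commit):

```python
# Hypothetical helper showing what the Literal annotation buys: a static type
# checker (e.g. mypy) flags values other than "right"/"left", while None still
# means "fall back to the tokenizer-level default".
from typing import Literal, Optional


def resolve_side(padding_side: Optional[Literal["right", "left"]], default: str = "right") -> str:
    return padding_side if padding_side is not None else default


resolve_side("left")   # OK
resolve_side(None)     # OK, falls back to the default
resolve_side("top")    # rejected by a type checker; Python itself would not complain at runtime
```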

paddlenlp/transformers/dallebart/tokenizer.py (2 additions, 0 deletions)

@@ -464,6 +464,7 @@ def __call__(
         return_offsets_mapping=False,
         add_special_tokens=True,
         pad_to_multiple_of=None,
+        padding_side=None,
         return_tensors=None,
         verbose: bool = True,
         **kwargs
@@ -497,6 +498,7 @@ def __call__(
             return_offsets_mapping,
             add_special_tokens,
             pad_to_multiple_of,
+            padding_side,
             return_tensors,
             verbose,
             **kwargs,

paddlenlp/transformers/gemma/tokenizer.py (6 additions, 2 deletions)

@@ -15,7 +15,7 @@
 
 import os
 from shutil import copyfile
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import sentencepiece as spm
@@ -323,6 +323,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -345,6 +346,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -359,7 +363,7 @@ def _pad(
 
         required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
         )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask

paddlenlp/transformers/gpt/tokenizer.py (7 additions, 3 deletions)

@@ -17,7 +17,7 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Optional, Union
+from typing import Dict, Literal, Optional, Union
 
 import jieba
 import numpy as np
@@ -584,6 +584,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -599,13 +600,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -620,7 +624,7 @@ def _pad(
 
         required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
         )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask

paddlenlp/transformers/llama/tokenizer.py (13 additions, 5 deletions)

@@ -15,7 +15,7 @@
 
 import os
 from shutil import copyfile
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import sentencepiece as spm
@@ -232,6 +232,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -247,13 +248,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -268,7 +272,7 @@ def _pad(
 
         required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
         )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask
@@ -521,6 +525,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -536,13 +541,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -557,7 +565,7 @@ def _pad(
 
         required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
         )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask

paddlenlp/transformers/mamba/tokenizer.py (7 additions, 3 deletions)

@@ -18,7 +18,7 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Optional, Union
+from typing import Dict, Literal, Optional, Union
 
 import numpy as np
 from paddle.utils import try_import
@@ -302,6 +302,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -317,13 +318,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -338,7 +342,7 @@ def _pad(
 
        required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
         )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask

paddlenlp/transformers/qwen/tokenizer.py (7 additions, 3 deletions)

@@ -17,7 +17,7 @@
 import base64
 import os
 import unicodedata
-from typing import Collection, Dict, List, Optional, Set, Tuple, Union
+from typing import Collection, Dict, List, Literal, Optional, Set, Tuple, Union
 
 import numpy as np
 
@@ -255,6 +255,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -270,13 +271,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -291,7 +295,7 @@ def _pad(
 
         required_input = encoded_inputs[self.model_input_names[0]]
         encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask
+            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
        )
         if attention_mask is not None and len(np.shape(attention_mask)) > 2:
             encoded_inputs["attention_mask"] = attention_mask
