@@ -1156,6 +1156,31 @@ def test_maximum_encoding_length_pair_input(self):
 
         # self.assertEqual(encoded_masked, encoded_1)
 
+    def test_special_token_addition(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                # Create tokenizer and add an additional special token
+                tokenizer_1 = tokenizer.from_pretrained(pretrained_name)
+                tokenizer_1.add_special_tokens({"additional_special_tokens": ["<tok>"]})
+                self.assertEqual(tokenizer_1.additional_special_tokens, ["<tok>"])
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    tokenizer_1.save_pretrained(tmp_dir)
+                    # Load the above tokenizer and add the same special token a second time
+                    tokenizer_2 = tokenizer.from_pretrained(pretrained_name)
+                    tokenizer_2.add_special_tokens({"additional_special_tokens": ["<tok>"]})
+                    self.assertEqual(tokenizer_2.additional_special_tokens, ["<tok>"])
+
+                    tokenizer_2.add_special_tokens({"additional_special_tokens": ["<tok>", "<other>"]})
+                    self.assertEqual(tokenizer_2.additional_special_tokens, ["<tok>", "<other>"])
+                    tokenizer_2.add_special_tokens({"additional_special_tokens": ["<other>", "<another>"]})
+                    self.assertEqual(tokenizer_2.additional_special_tokens, ["<other>", "<another>"])
+
+                    tokenizer_2.add_special_tokens(
+                        {"additional_special_tokens": ["<tok>"]},
+                        replace_additional_special_tokens=False,
+                    )
+                    self.assertEqual(tokenizer_2.additional_special_tokens, ["<other>", "<another>", "<tok>"])
+
     def test_special_tokens_mask(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
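For context, a minimal usage sketch of the behavior this test pins down; it is not part of the diff, and it assumes a transformers version whose add_special_tokens accepts the replace_additional_special_tokens flag (which the test above asserts). The checkpoint name is an arbitrary example.

    from transformers import AutoTokenizer

    # "bert-base-uncased" is an arbitrary example checkpoint, not one used by this test.
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")

    # By default, each call replaces the additional_special_tokens list wholesale.
    tok.add_special_tokens({"additional_special_tokens": ["<tok>"]})
    print(tok.additional_special_tokens)  # ['<tok>']
    tok.add_special_tokens({"additional_special_tokens": ["<other>"]})
    print(tok.additional_special_tokens)  # ['<other>']  ('<tok>' was dropped from the list)

    # With replace_additional_special_tokens=False, new tokens extend the list instead.
    tok.add_special_tokens(
        {"additional_special_tokens": ["<tok>"]},
        replace_additional_special_tokens=False,
    )
    print(tok.additional_special_tokens)  # ['<other>', '<tok>']

This replace-by-default semantics is why the test expects ["<other>", "<another>"] after the third call and only sees ["<other>", "<another>", "<tok>"] once the flag is set to False.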