Skip to content

Commit ab67ff2

Browse files
authored
[FastTokenizer] Fix fast_tokenizer import (#4126)
* Fix fast_tokenizer import
* Use `import_module` instead of `importlib.import_module`
* Add auto tokenizer unittest
* Update to `__internal_testing__`
* Add test
1 parent f96e787 commit ab67ff2

File tree

3 files changed

+74
-7
lines changed

3 files changed

+74
-7
lines changed

paddlenlp/transformers/auto/tokenizer.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from paddlenlp import __version__
2424
from paddlenlp.utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url
2525
from paddlenlp.utils.env import HF_CACHE_HOME, MODEL_HOME
26-
from paddlenlp.utils.import_utils import is_fast_tokenizer_available
26+
from paddlenlp.utils.import_utils import import_module, is_fast_tokenizer_available
2727
from paddlenlp.utils.log import logger
2828

2929
__all__ = [
@@ -154,13 +154,31 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_
154154

155155
if init_class:
156156
class_name = cls._name_mapping[init_class]
157-
import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer")
157+
import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer")
158158
tokenizer_class = getattr(import_class, init_class)
159159
if use_fast:
160-
for fast_tokenizer_class, name in cls._fast_name_mapping.items():
161-
if name == class_name:
162-
import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.fast_tokenizer")
163-
tokenizer_class = getattr(import_class, fast_tokenizer_class)
160+
if is_fast_tokenizer_available():
161+
is_support_fast_tokenizer = False
162+
init_class_prefix = init_class[:-9]
163+
for fast_tokenizer_class, name in cls._fast_name_mapping.items():
164+
fast_tokenizer_class_prefix = fast_tokenizer_class[:-9]
165+
if name == class_name and fast_tokenizer_class_prefix.startswith(init_class_prefix):
166+
is_support_fast_tokenizer = True
167+
import_class = import_module(f"paddlenlp.transformers.{class_name}.fast_tokenizer")
168+
tokenizer_class = getattr(import_class, fast_tokenizer_class)
169+
break
170+
if not is_support_fast_tokenizer:
171+
logger.warning(
172+
f"The tokenizer {tokenizer_class} doesn't have the fast version."
173+
" Please check the map `paddlenlp.transformers.auto.tokenizer.FAST_TOKENIZER_MAPPING_NAMES`"
174+
" to see which fast tokenizers are currently supported."
175+
)
176+
else:
177+
logger.warning(
178+
"Can't find the fast_tokenizer package, "
179+
"please ensure install fast_tokenizer correctly. "
180+
"You can install fast_tokenizer by `pip install fast-tokenizer-python`."
181+
)
164182
return tokenizer_class
165183
# If no `init_class`, we use pattern recognition to recognize the tokenizer class.
166184
else:
@@ -170,7 +188,7 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_
170188
if pattern in pretrained_model_name_or_path.lower():
171189
init_class = key
172190
class_name = cls._name_mapping[init_class]
173-
import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer")
191+
import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer")
174192
tokenizer_class = getattr(import_class, init_class)
175193
return tokenizer_class
176194

tests/transformers/auto/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
# Copyright 2019 Hugging Face inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import unittest
17+
18+
import paddlenlp
19+
from paddlenlp.transformers import AutoTokenizer, is_fast_tokenizer_available
20+
21+
22+
class AutoTokenizerTest(unittest.TestCase):
    """Tests for AutoTokenizer's fast/slow tokenizer class resolution."""

    def test_fast_tokenizer_import(self):
        """`use_fast=True` yields the fast class only when fast_tokenizer is installed."""
        slow_tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/bert", use_fast=False)
        self.assertIsInstance(slow_tokenizer, paddlenlp.transformers.BertTokenizer)

        maybe_fast_tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/bert", use_fast=True)
        expected_cls = (
            paddlenlp.transformers.BertFastTokenizer
            if is_fast_tokenizer_available()
            else paddlenlp.transformers.BertTokenizer
        )
        self.assertIsInstance(maybe_fast_tokenizer, expected_cls)

    def test_fast_tokenizer_non_exist(self):
        """Requesting a fast tokenizer for a model without one falls back to the slow class."""
        # T5 FastTokenizer doesn't exist yet, so from_pretrained will return the normal tokenizer.
        fallback_tokenizer = AutoTokenizer.from_pretrained("t5-small", use_fast=True)
        self.assertIsInstance(fallback_tokenizer, paddlenlp.transformers.T5Tokenizer)

0 commit comments

Comments (0)