
Commit 22ae267

joey12300 and ZeyuChen authored
[FastTokenizer] Add clip fast tokenizer (#3746)
* Add clip fast tokenizer
* Add clip fast tokenizer unittest
* Add ThreadNum Set Get
* Add set, get thread num pybind
* Add pybind of set get thread num
* Add sido foframework or clip
* Add README
* Remove omp
* Add EncodeBatch for 2 vectors of strings
* Fix AddedToken
* Remove words_idx print
* Add comments

Co-authored-by: Zeyu Chen <chenzeyu01@baidu.com>
1 parent ec8ef93 commit 22ae267

29 files changed: +682 −301 lines

fast_tokenizer/CMakeLists.txt

Lines changed: 1 addition & 12 deletions
@@ -4,7 +4,7 @@ project(tokenizers LANGUAGES CXX C VERSION 1.0)
 
 option(WITH_TESTING "Compile PaddleNLP fast_tokenizer with unit testing" OFF)
 option(WITH_PYTHON "Compile PaddleNLP fast_tokenizer with python interpreter" ON)
-add_definitions(-DFASTERTOKENIZER_LIB)
+add_definitions(-DFASTTOKENIZER_LIB)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set (PUBLIC_DEPEND_LIBS "")
@@ -108,17 +108,6 @@ ELSE(WIN32)
 set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS})
 ENDIF(WIN32)
 
-# For OpenMP
-# openmp not support well for now on windows
-if (NOT APPLE AND NOT WIN32) # Linux
-  find_package(OpenMP)
-  if (OPENMP_FOUND)
-    add_definitions(-DWITH_OMP)
-    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-  endif()
-endif()
-
 set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR})
 set(TOKENIZERS_INSTALL_INCLUDE_DIR ${PROJECT_SOURCE_DIR})

fast_tokenizer/FastTokenizer.cmake

Lines changed: 7 additions & 7 deletions
@@ -16,18 +16,18 @@ endif()
 
 set(LIBRARY_NAME core_tokenizers)
 
-set(FASTER_TOKENIZER_INCS "")
-list(APPEND FASTER_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/include)
-list(APPEND FASTER_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/third_party/include)
+set(FAST_TOKENIZER_INCS "")
+list(APPEND FAST_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/include)
+list(APPEND FAST_TOKENIZER_INCS ${CMAKE_CURRENT_LIST_DIR}/third_party/include)
 
-set(FASTER_TOKENIZER_LIBS "")
+set(FAST_TOKENIZER_LIBS "")
 find_library(FTLIB ${LIBRARY_NAME} ${CMAKE_CURRENT_LIST_DIR}/lib NO_DEFAULT_PATH)
-list(APPEND FASTER_TOKENIZER_LIBS ${FTLIB})
+list(APPEND FAST_TOKENIZER_LIBS ${FTLIB})
 
 if (WIN32)
   find_library(ICUDT icudt ${CMAKE_CURRENT_LIST_DIR}/third_party/lib NO_DEFAULT_PATH)
-  list(APPEND FASTER_TOKENIZER_LIBS ${ICUDT})
+  list(APPEND FAST_TOKENIZER_LIBS ${ICUDT})
 
   find_library(ICUUC icuuc ${CMAKE_CURRENT_LIST_DIR}/third_party/lib NO_DEFAULT_PATH)
-  list(APPEND FASTER_TOKENIZER_LIBS ${ICUUC})
+  list(APPEND FAST_TOKENIZER_LIBS ${ICUUC})
 endif()

fast_tokenizer/README.md

Lines changed: 17 additions & 4 deletions
@@ -17,7 +17,7 @@ FastTokenizer is an easy-to-use, powerful, cross-platform, high-performance text preprocessing…
 
 ## Features
 
-- High performance. Because the core is implemented in C++, it is far faster than typical Python tokenizer implementations. On text classification tasks, FastTokenizer reaches speedups of up to 20x over the Python tokenizer.
+- High performance. Because the core is implemented in C++, it is far faster than typical Python tokenizer implementations. On text classification tasks, FastTokenizer reaches speedups of up to 20x over the Python tokenizer. Multi-threaded tokenization of text batches is supported; tokenization is single-threaded by default.
 - Cross-platform. FastTokenizer currently runs on Windows x64, Linux x64, and macOS 10.14+.
 - Multiple programming languages. FastTokenizer can be used from both C++ and Python.
 - Highly flexible. Users can build a tokenizer that meets their needs by combining different FastTokenizer components.
@@ -26,12 +26,12 @@ FastTokenizer is an easy-to-use, powerful, cross-platform, high-performance text preprocessing…
 
 The following describes how to use the Python version of FastTokenizer; for the C++ version, see the [FastTokenizer C++ Demo](./fast_tokenizer/demo/README.md).
 
-### Prerequisites
+### Requirements
 
 - Windows 64-bit
 - Linux x64
 - macOS 10.14+ (on M1-chip Macs, the x86_64 build of Anaconda must be used as the Python environment)
-- Python 3.6 ~ 3.9
+- Python 3.6 ~ 3.10
 
 ### Installing FastTokenizer
 
@@ -53,7 +53,11 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt
 FastTokenizer ships with tokenizers commonly used in NLP tasks, such as ErnieFastTokenizer. The following shows basic FastTokenizer usage.
 
 ```python
+import fast_tokenizer
 from fast_tokenizer import ErnieFastTokenizer, models
+
+# 0. (Optional) Set the number of threads
+fast_tokenizer.set_thread_num(1)
 # 1. Load the vocabulary
 vocab = models.WordPiece.read_file("ernie_vocab.txt")
 # 2. Instantiate an ErnieFastTokenizer
@@ -96,10 +100,19 @@ Q: I have already enabled the `use_fast=True` switch in AutoTokenizer.from_pretrained…
 A: There are three situations in which enabling `use_fast=True` may fail to improve performance:
 1. fast_tokenizer is not installed. If the `use_fast` switch is enabled without the fast_tokenizer library installed, PaddleNLP emits the warning: "Can't find the fast_tokenizer package, please ensure install fast_tokenizer correctly. "
 
-2. The loaded tokenizer type has no fast version yet. Fast versions are currently available for four tokenizers: BERT, ERNIE, TinyBERT, and ERNIE-M. If the `use_fast` switch is enabled for a tokenizer with no fast version, PaddleNLP emits the warning: "The tokenizer XXX doesn't have the fast version. Please check the map paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES to see which fast tokenizers are currently supported."
+2. The loaded tokenizer type has no fast version yet. Fast versions are currently available for four tokenizers: BERT, ERNIE, TinyBERT, and ERNIE-M. If the `use_fast` switch is enabled for a tokenizer with no fast version, PaddleNLP emits the warning: "The tokenizer XXX doesn't have the fast version. Please check the map paddlenlp.transformers.auto.tokenizer.FAST_TOKENIZER_MAPPING_NAMES to see which fast tokenizers are currently supported."
 
 3. The texts to tokenize are very short (e.g., an average length below 5). In that case tokenization may not be the bottleneck of the whole preprocessing pipeline, so overall performance may not improve even with FastTokenizer.
 
+Q: How do I use multiple threads to speed up tokenization?
+
+A: Call `fast_tokenizer.set_thread_num(xxx)` to tokenize with multiple threads. Multi-threaded tokenization should be enabled with care; it is worth considering in the following scenarios:
+1. Ample CPU resources. If inference also runs on the CPU, multi-threaded tokenization may compete for resources and degrade inference performance.
+
+2. Large batch sizes. With small batches, multi-threading may bring no speedup at all and may even increase latency because of thread scheduling. Consider multi-threaded tokenization only when the batch size is larger than 4.
+
+3. Long texts. With short texts, multi-threading may bring no speedup at all and may even increase latency because of thread scheduling. Consider multi-threaded tokenization only when the average text length is larger than 16.
+
 ## Related Documentation
 
 [FastTokenizer Compilation Guide](docs/compile/README.md)
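The FAQ above gates multi-threading on batch size (> 4) and average text length (> 16). A minimal C++ sketch of that heuristic, using the `SetThreadNum`/`GetThreadNum` API this commit adds to `fast_tokenizer/core/base.h` (the `PickThreadNum` helper and the batch contents are illustrative, not part of the commit):

```cpp
#include <string>
#include <vector>

#include "fast_tokenizer/core/base.h"  // SetThreadNum / GetThreadNum (added in this commit)

namespace core = paddlenlp::fast_tokenizer::core;

// Hypothetical helper following the README guidance: stay single-threaded
// unless the batch has more than 4 texts averaging more than 16 characters.
int PickThreadNum(const std::vector<std::string>& batch, int max_threads) {
  if (batch.size() <= 4) return 1;
  size_t total_len = 0;
  for (const auto& text : batch) total_len += text.size();
  if (total_len / batch.size() <= 16) return 1;
  return max_threads;
}

int main() {
  std::vector<std::string> batch = {"a batch", "of texts", "to tokenize", "..."};
  core::SetThreadNum(PickThreadNum(batch, /*max_threads=*/4));
  // ... build a tokenizer (e.g. ErnieFastTokenizer) and encode the batch ...
  return 0;  // this 4-text batch stays single-threaded
}
```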

fast_tokenizer/fast_tokenizer/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ endif()
 else(WITH_PYTHON)
   # add_subdirectory(tokenizers)
   cc_library(core_tokenizers SHARED
-    SRCS tokenizers/ernie_fast_tokenizer.cc
+    SRCS tokenizers/ernie_fast_tokenizer.cc tokenizers/clip_fast_tokenizer.cc
     DEPS normalizers pretokenizers models decoders
     postprocessors core added_vocabulary tokenizer json)
 

fast_tokenizer/fast_tokenizer/core/CMakeLists.txt

Lines changed: 3 additions & 2 deletions

@@ -1,3 +1,4 @@
 cc_library(added_vocabulary SRCS added_vocabulary.cc DEPS normalizers pretokenizers json)
-cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors)
-cc_library(core SRCS encoding.cc DEPS json)
+cc_library(base SRCS base.cc)
+cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors base)
+cc_library(core SRCS encoding.cc DEPS json base)
fast_tokenizer/fast_tokenizer/core/base.cc

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fast_tokenizer/core/base.h"
+#include <thread>
+
+namespace paddlenlp {
+namespace fast_tokenizer {
+namespace core {
+
+static int fast_tokenizer_thread_num = 1;
+
+void SetThreadNum(int thread_num) { fast_tokenizer_thread_num = thread_num; }
+
+int GetThreadNum() { return fast_tokenizer_thread_num; }
+
+void RunMultiThread(std::function<void(size_t, size_t)> func,
+                    size_t batch_size) {
+  int thread_num = GetThreadNum();
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size / float(thread_num));
+
+  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread.emplace_back(std::thread(func, start_index, step_index));
+    start_index = start_index + step_index;
+  }
+  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+}
+
+}  // namespace core
+}  // namespace fast_tokenizer
+}  // namespace paddlenlp
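Note how this new `RunMultiThread` differs from the OpenMP-era version removed from `encoding.cc` below: the thread count now comes from the explicit `SetThreadNum` setting rather than `OMP_NUM_THREADS`. Each worker receives a `(start_index, step_index)` pair, and because `step_index` is rounded up, the last chunk can extend past `batch_size`, so workers are expected to clamp. A hedged usage sketch (the lambda and data are illustrative, not from the commit):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

#include "fast_tokenizer/core/base.h"

namespace core = paddlenlp::fast_tokenizer::core;

int main() {
  std::vector<int> results(10, 0);  // stand-in for per-text tokenization output

  core::SetThreadNum(3);  // ceil(10 / 3) = 4 -> chunks start at 0, 4, 8

  // Each worker gets (start, step); clamp because start + step may overshoot
  // the batch size on the last thread (8 + 4 > 10 here).
  auto worker = [&results](size_t start, size_t step) {
    size_t end = std::min(start + step, results.size());
    for (size_t i = start; i < end; ++i) {
      results[i] = static_cast<int>(i);  // "process" item i
    }
  };
  core::RunMultiThread(worker, results.size());

  for (int v : results) std::printf("%d ", v);  // 0 1 2 ... 9
  return 0;
}
```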

fast_tokenizer/fast_tokenizer/core/base.h

Lines changed: 7 additions & 0 deletions
@@ -366,6 +366,13 @@ struct FASTTOKENIZER_DECL BPEWord {
   std::vector<Symbol> symbols_;
 };
 
+FASTTOKENIZER_DECL void SetThreadNum(int thread_num);
+
+FASTTOKENIZER_DECL int GetThreadNum();
+
+FASTTOKENIZER_DECL void RunMultiThread(std::function<void(size_t, size_t)> func,
+                                       size_t batch_size);
+
 }  // namespace core
 }  // namespace fast_tokenizer
 }  // namespace paddlenlp

fast_tokenizer/fast_tokenizer/core/encoding.cc

Lines changed: 0 additions & 60 deletions
@@ -19,10 +19,6 @@ limitations under the License. */
 #include <sstream>
 #include "glog/logging.h"
 
-#ifdef WITH_OMP
-#include <omp.h>
-#endif
-
 namespace paddlenlp {
 namespace fast_tokenizer {
 namespace core {
@@ -547,15 +543,6 @@ std::string Encoding::DebugString() const {
     oss << "{" << iter->first << " : (" << iter->second.first << ", "
         << iter->second.second << ") }, ";
   }
-  oss << "\n";
-
-  oss << "words_idx:";
-  for (int i = 0; i < words_idx_.size(); ++i) {
-    oss << words_idx_[i];
-    if (i < words_idx_.size() - 1) {
-      oss << ", ";
-    }
-  }
   return oss.str();
 }
 
@@ -667,62 +654,15 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
     pad_length += pad_length - pad_length % method.pad_to_multiple_of_;
   }
   auto batch_size = encodings->size();
-#ifdef WITH_OMP
-#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-  for (int i = 0; i < batch_size; ++i) {
-    auto& encoding = (*encodings)[i];
-    encoding.Pad(pad_length,
-                 method.pad_id_,
-                 method.pad_token_type_id_,
-                 method.pad_token_,
-                 method.direction_);
-  }
-#else
   auto func = std::bind(&MultiThreadPadEncodings,
                         encodings,
                         std::ref(method),
                         pad_length,
                         std::placeholders::_1,
                         std::placeholders::_2);
   RunMultiThread(func, batch_size);
-#endif
 }
 
-int GetThreadNum(size_t batch_size) {
-  char* env_var = std::getenv("OMP_NUM_THREADS");
-  int thread_num = std::atoi(env_var);
-  if (batch_size <= 0) {
-    thread_num = 1;
-    VLOG(3) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
-  } else {
-    int best_num = ceil(batch_size / 4.0);
-    if (thread_num > best_num) {
-      thread_num = best_num;
-      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = "
-                 "batch_size/4";
-    } else if (thread_num == 0) {
-      thread_num = best_num;
-      VLOG(3) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
-    }
-  }
-  return thread_num;
-}
-
-void RunMultiThread(std::function<void(size_t, size_t)> func,
-                    size_t batch_size) {
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size / float(thread_num));
-
-  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
-    vectorOfThread.emplace_back(std::thread(func, start_index, step_index));
-    start_index = start_index + step_index;
-  }
-  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
-    vectorOfThread[thread_index].join();
-  }
-}
 
 }  // namespace core
 }  // namespace fast_tokenizer
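`PadEncodings` shows the intended calling pattern: a worker that also needs fixed arguments (the encodings, the pad method, the pad length) is adapted to the `(start, step)` interface with `std::bind` and the two placeholders. A sketch of the same pattern with a hypothetical chunk worker (not from the commit), again clamping `start + step` inside the worker:

```cpp
#include <algorithm>
#include <functional>
#include <vector>

#include "fast_tokenizer/core/base.h"

namespace core = paddlenlp::fast_tokenizer::core;

// Hypothetical analogue of MultiThreadPadEncodings: fixed arguments first,
// then the (start, step) pair that RunMultiThread supplies via placeholders.
static void ScaleChunk(std::vector<float>* values, float factor,
                       size_t start, size_t step) {
  size_t end = std::min(start + step, values->size());
  for (size_t i = start; i < end; ++i) (*values)[i] *= factor;
}

int main() {
  std::vector<float> values(100, 1.0f);
  auto func = std::bind(&ScaleChunk, &values, 2.0f,
                        std::placeholders::_1, std::placeholders::_2);
  core::RunMultiThread(func, values.size());  // single thread by default
  return 0;
}
```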

fast_tokenizer/fast_tokenizer/core/encoding.h

Lines changed: 0 additions & 4 deletions
@@ -130,10 +130,6 @@ bool FASTTOKENIZER_DECL TruncateEncodings(Encoding* encoding,
 void FASTTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                      const PadMethod& method);
 
-int FASTTOKENIZER_DECL GetThreadNum(size_t batch_size);
-
-void FASTTOKENIZER_DECL RunMultiThread(std::function<void(size_t, size_t)> func,
-                                       size_t batch_size);
 }  // namespace core
 }  // namespace fast_tokenizer
 }  // namespace paddlenlp
