Skip to content

Commit a4749e1

Browse files
authored
Add ft compile doc and scripts (#3292)
* Fix the mac compile * Add cpp, python lib building scripts * Remove cache in cpp lib * Add compile docs
1 parent 9a25764 commit a4749e1

File tree

9 files changed

+173
-1
lines changed

9 files changed

+173
-1
lines changed

faster_tokenizer/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,7 @@ A:在有三种情况下,打开`use_faster=True`开关可能无法提升性
9999
2. 加载的Tokenizer类型暂不支持Faster版本。目前支持4种Tokenizer的Faster版本,分别是BERT、ERNIE、TinyBERT以及ERNIE-M Tokenizer。若加载不支持Faster版本的Tokenizer情况下打开`use_faster`开关,PaddleNLP会给出以下warning:"The tokenizer XXX doesn't have the faster version. Please check the map paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES to see which faster tokenizers are currently supported."
100100

101101
3. 待切词文本长度过短(如文本平均长度小于5)。这种情况下切词开销可能不是整个文本预处理的性能瓶颈,导致在使用FasterTokenizer后仍无法提升整体性能。
102+
103+
## 相关文档
104+
105+
[FasterTokenizer编译指南](docs/compile/README.md)
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# FasterTokenizer编译指南
2+
3+
本文档说明编译FasterTokenizer C++库、Python库两种编译过程,根据编译的平台参考如下文档
4+
5+
- [Linux & Mac 编译](./how_to_build_linux_and_mac.md)
6+
- [Windows编译](./how_to_build_windows.md)
7+
8+
FasterTokenizer使用CMake编译,各平台可用的编译选项如下表所示:
9+
10+
| 选项 | 作用 | 备注 |
11+
|:---- | :--- | :--- |
12+
| WITH_PYTHON | 是否编译Python库 | 默认为ON |
13+
| WITH_TESTING | 是否编译C++单测 | 默认为OFF |
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Linux & Mac编译
2+
3+
## 环境依赖
4+
5+
- cmake >= 3.10
6+
- gcc >= 8.2.0
7+
8+
## 编译C++库方法
9+
10+
```bash
11+
git clone https://github.com/PaddlePaddle/PaddleNLP.git
12+
cd PaddleNLP/faster_tokenizer
13+
mkdir build && cd build
14+
cmake .. -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
15+
make -j8
16+
```
17+
18+
编译后的C++库在当前目录下的`cpp`目录下。
19+
20+
## 编译Python库方法
21+
22+
```bash
23+
git clone https://github.com/PaddlePaddle/PaddleNLP.git
24+
cd PaddleNLP/faster_tokenizer
25+
mkdir build && cd build
26+
# 设置Python环境
27+
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
28+
export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
29+
30+
cmake .. -DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
31+
make -j8
32+
```
33+
34+
编译后的wheel包即在当前目录下的`dist`目录中
35+
36+
更多编译选项说明参考[编译指南](./README.md)
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Windows 编译
2+
3+
## 环境依赖
4+
5+
- cmake >= 3.10
6+
- VS 2019
7+
- ninja
8+
- cmake >= 3.10
9+
10+
以上依赖安装好后,在Windows菜单打开`x64 Native Tools Command Prompt for VS 2019`命令工具即可进行下面的编译环节。
11+
12+
## 编译C++库方法
13+
14+
```bash
15+
git clone https://github.com/PaddlePaddle/PaddleNLP.git
16+
cd PaddleNLP/faster_tokenizer
17+
mkdir build & cd build
18+
cmake .. -G "Ninja" -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
19+
ninja -j8
20+
```
21+
22+
编译后的C++库在当前目录下的`cpp`目录下。
23+
24+
## 编译Python库方法
25+
26+
```bash
27+
git clone https://github.com/PaddlePaddle/PaddleNLP.git
28+
cd PaddleNLP/faster_tokenizer
29+
mkdir build & cd build
30+
# 需要指定Python库
31+
cmake .. -G "Ninja" -DWITH_PYTHON=ON ^
32+
-DWITH_TESTING=OFF ^
33+
-DCMAKE_BUILD_TYPE=Release ^
34+
-DPYTHON_EXECUTABLE=C:\Python37\python.exe ^
35+
-DPYTHON_INCLUDE_DIR=C:\Python37\include ^
36+
-DPYTHON_LIBRARY=C:\Python37\libs\python37.lib
37+
ninja -j8
38+
```
39+
40+
编译后的wheel包即在当前目录下的`dist`目录中
41+
42+
更多编译选项说明参考[编译指南](./README.md)

faster_tokenizer/faster_tokenizer/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ add_subdirectory(postprocessors)
66
add_subdirectory(core)
77
add_subdirectory(utils)
88
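# Embed an $ORIGIN rpath in shared libraries so that, at load time, their dependencies are searched for relative to the library's own directory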
9-
if (UNIX)
9+
if (NOT APPLE AND NOT WIN32)
1010
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
1111
endif()
1212

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
@rem Build the FasterTokenizer C++ library with the Ninja generator.
@rem Run from the directory that contains the top-level CMakeLists.txt;
@rem cmake and ninja must be available on PATH.
if not exist build_cpp mkdir build_cpp
@rem Abort if the build directory cannot be entered. Without this guard the
@rem cleanup below would delete the contents of the current source directory.
cd build_cpp || exit /b 1
@rem Start from a clean build directory: remove sub-directories, then files.
for /d %%G in ("*") do rmdir /s /q "%%G"
del /q *
@rem C++ only build: Python binding and C++ unit tests are disabled.
cmake .. -G "Ninja" -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
ninja -j20
cd ..

faster_tokenizer/run_build_cpp_lib.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Build the FasterTokenizer C++ library. Can be used in linux and mac.
# Run from the directory that contains the top-level CMakeLists.txt.
mkdir -p build_cpp
# Abort if the build directory cannot be entered; otherwise the cleanup
# below would wipe the current (source) directory instead.
cd build_cpp || exit 1
# Always start from a clean build directory.
rm -rf ./*
# C++-only build: the Python binding is switched off, matching the Windows
# counterpart of this script, and the C++ unit tests are not built.
cmake .. -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
make -j48
cd ..

faster_tokenizer/run_build_py_lib.bat

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
@rem Build the FasterTokenizer Python library for Python 3.6, 3.7, 3.8 and 3.9.
@rem Every version is configured and built in its own build_py3X directory and
@rem the interpreter is expected under C:\Python3X.
for %%x in (6 7 8 9) do (
    if not exist build_py3%%x mkdir build_py3%%x
    rem Abort if the build directory cannot be entered. Without this guard the
    rem cleanup below would delete the contents of the current source directory.
    cd build_py3%%x || exit /b 1
    rem Start from a clean build directory: remove sub-directories, then files.
    for /d %%G in ("*") do rmdir /s /q "%%G"
    del /q *
    cmake .. -G "Ninja" -DWITH_PYTHON=ON ^
             -DWITH_TESTING=OFF ^
             -DCMAKE_BUILD_TYPE=Release ^
             -DPYTHON_EXECUTABLE=C:\Python3%%x\python.exe ^
             -DPYTHON_INCLUDE_DIR=C:\Python3%%x\include ^
             -DPYTHON_LIBRARY=C:\Python3%%x\libs\python3%%x.lib
    ninja -j20
    cd ..
)

faster_tokenizer/run_build_py_lib.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Can be used in linux and mac
# Build the Python library once per interpreter version (3.6 - 3.9), each in
# its own build_py3X directory.
mkdir -p build_py36 build_py37 build_py38 build_py39
# The platform cannot change between iterations, so query it only once.
platform="$(uname -s)"
for py_version in 6 7 8 9; do
    # Abort if the build directory cannot be entered; otherwise the cleanup
    # below would wipe the current (source) directory instead.
    cd "build_py3${py_version}" || exit 1
    # Always start from a clean build directory.
    rm -rf ./*
    # Prepend the interpreter for this version to the search paths
    # (presumably so that the CMake configure step detects it -- confirm).
    # `case` is used instead of `[[ ... ]]` so the script also runs under a
    # plain POSIX sh.
    case "${platform}" in
        Linux*)
            export LD_LIBRARY_PATH=/opt/_internal/cpython-3.${py_version}.0/lib/:${LD_LIBRARY_PATH}
            export PATH=/opt/_internal/cpython-3.${py_version}.0/bin/:${PATH}
            ;;
        *)
            export LD_LIBRARY_PATH=/Users/paddle/miniconda2/envs/py3${py_version}/lib/:${LD_LIBRARY_PATH}
            export PATH=/Users/paddle/miniconda2/envs/py3${py_version}/bin/:${PATH}
            ;;
    esac
    cmake .. -DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
    make -j24
    cd ..
done
35+

0 commit comments

Comments
 (0)