Skip to content

Commit f653728

Browse files
authored
[FastTokenizer] Fix JSON of tokenizers (#4165)
* Fix json of tokenizers
* Add c_wrap
* Update version
1 parent ab67ff2 commit f653728

File tree

4 files changed

+539
-507
lines changed

4 files changed

+539
-507
lines changed

fast_tokenizer/fast_tokenizer/core/tokenizer.cc

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

15+
#include "fast_tokenizer/core/tokenizer.h"
16+
1517
#include <fstream>
16-
#include "glog/logging.h"
1718

1819
#include "fast_tokenizer/core/added_vocabulary.h"
1920
#include "fast_tokenizer/core/base.h"
2021
#include "fast_tokenizer/core/encoding.h"
21-
#include "fast_tokenizer/core/tokenizer.h"
22-
2322
#include "fast_tokenizer/decoders/decoders.h"
2423
#include "fast_tokenizer/models/models.h"
2524
#include "fast_tokenizer/normalizers/normalizers.h"
2625
#include "fast_tokenizer/postprocessors/postprocessors.h"
2726
#include "fast_tokenizer/pretokenizers/pretokenizers.h"
27+
#include "glog/logging.h"
2828

2929
namespace paddlenlp {
3030
namespace fast_tokenizer {
@@ -685,7 +685,12 @@ void from_json(const nlohmann::json& j, Tokenizer& tokenizer) {
685685
}
686686

687687
// deserialize pretokenizer_
688-
const auto& pretokenizer = j.at("pretokenizer");
688+
nlohmann::json pretokenizer;
689+
if (j.find("pretokenizer") == j.end()) {
690+
pretokenizer = j.at("pre_tokenizer");
691+
} else {
692+
pretokenizer = j.at("pretokenizer");
693+
}
689694
if (!pretokenizer.is_null()) {
690695
if (pretokenizer.at("type") == "BertPreTokenizer") {
691696
pretokenizers::BertPreTokenizer bert_pretokenizer;
@@ -735,7 +740,12 @@ void from_json(const nlohmann::json& j, Tokenizer& tokenizer) {
735740
}
736741

737742
// deserialize post_processor_
738-
const auto& post_processor = j.at("postprocessor");
743+
nlohmann::json post_processor;
744+
if (j.find("postprocessor") == j.end()) {
745+
post_processor = j.at("post_processor");
746+
} else {
747+
post_processor = j.at("postprocessor");
748+
}
739749
if (!post_processor.is_null()) {
740750
if (post_processor.at("type") == "BertPostProcessor") {
741751
postprocessors::BertPostProcessor bert_postprocessor;

fast_tokenizer/fast_tokenizer/normalizers/bert.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "fast_tokenizer/normalizers/bert.h"
16+
1617
#include <algorithm>
1718
#include <codecvt>
1819
#include <locale>
20+
1921
#include "fast_tokenizer/normalizers/strip.h"
2022
#include "fast_tokenizer/normalizers/utils.h"
2123
#include "fast_tokenizer/utils/utils.h"
@@ -106,7 +108,11 @@ void from_json(const nlohmann::json& j, BertNormalizer& bert_normalizer) {
106108
j.at("clean_text").get_to(bert_normalizer.clean_text_);
107109
j.at("handle_chinese_chars").get_to(bert_normalizer.handle_chinese_chars_);
108110
j.at("lowercase").get_to(bert_normalizer.lowercase_);
109-
j.at("strip_accents").get_to(bert_normalizer.strip_accents_);
111+
if (!j.at("strip_accents").is_null()) {
112+
j.at("strip_accents").get_to(bert_normalizer.strip_accents_);
113+
} else {
114+
bert_normalizer.strip_accents_ = false;
115+
}
110116
}
111117

112118
} // namespace normalizers

0 commit comments

Comments
 (0)