[egs] Aishell2 recipe: turn off jieba's new word discovery in word segmentation (#2740)

dophist · danpovey · commit 1d079fa98a20 · 2018-09-26T14:36:24.000-04:00
diff --git a/egs/aishell2/s5/local/prepare_data.sh b/egs/aishell2/s5/local/prepare_data.sh
@@ -45,8 +45,9 @@ utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tm
 python -c "import jieba" 2>/dev/null || \
   (echo "jieba is not found. Use tools/extra/install_jieba.sh to install it." && exit 1;)
 utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt
-awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk 'BEGIN{idx=0}{print $1,idx++}'> $tmp/vocab.txt
-python local/word_segmentation.py $tmp/vocab.txt $tmp/trans.txt > $tmp/text
+# jieba's dictionary format requires a word count (frequency) for every entry; use a constant 99 for all words
+awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk '{print $1,99}'> $tmp/word_seg_vocab.txt
+python local/word_segmentation.py $tmp/word_seg_vocab.txt $tmp/trans.txt > $tmp/text
 
 # utt2spk & spk2utt
 awk -F'\t' '{print $2}' $tmp/wav.scp > $tmp/wav.list
diff --git a/egs/aishell2/s5/local/word_segmentation.py b/egs/aishell2/s5/local/word_segmentation.py
@@ -19,6 +19,6 @@
 jieba.set_dictionary(vocab_file)
 for line in open(trans_file):
   key,trans = line.strip().split('\t',1)
-  words = jieba.cut(trans)
+  words = jieba.cut(trans, HMM=False) # turn off new word discovery (HMM-based)
   new_line = key + '\t' + " ".join(words)
   print(new_line)