File tree: 2 files changed, +4 -3
lines changed Original file line number Diff line number Diff line change @@ -45,8 +45,9 @@ utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tm
45
45
python -c " import jieba" 2> /dev/null || \
46
46
(echo " jieba is not found. Use tools/extra/install_jieba.sh to install it." && exit 1; )
47
47
utils/filter_scp.pl -f 1 $tmp /utt.list $corpus /trans.txt | sort -k 1 | uniq > $tmp /trans.txt
48
- awk ' {print $1}' $dict_dir /lexicon.txt | sort | uniq | awk ' BEGIN{idx=0}{print $1,idx++}' > $tmp /vocab.txt
49
- python local/word_segmentation.py $tmp /vocab.txt $tmp /trans.txt > $tmp /text
48
+ # jieba's vocab format requires a word count (frequency) for every entry; set it to 99
49
+ awk ' {print $1}' $dict_dir /lexicon.txt | sort | uniq | awk ' {print $1,99}' > $tmp /word_seg_vocab.txt
50
+ python local/word_segmentation.py $tmp /word_seg_vocab.txt $tmp /trans.txt > $tmp /text
50
51
51
52
# utt2spk & spk2utt
52
53
awk -F' \t' ' {print $2}' $tmp /wav.scp > $tmp /wav.list
Original file line number Diff line number Diff line change 19
19
jieba .set_dictionary (vocab_file )
20
20
for line in open (trans_file ):
21
21
key ,trans = line .strip ().split ('\t ' ,1 )
22
- words = jieba .cut (trans )
22
+ words = jieba .cut (trans , HMM = False ) # turn off new word discovery (HMM-based )
23
23
new_line = key + '\t ' + " " .join (words )
24
24
print (new_line )
You can’t perform that action at this time.
0 commit comments