#!/bin/bash
# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng
#           2018 Xiaohui Zhang
#           2018 Vimal Manohar
# Apache 2.0

# This recipe is similar to the tdnn_lstm_1b recipe from fisher_swbd/s5, and is
# currently the best performing multi-en recipe.

# System                     tdnn_opgru_1b_sp  tdnn_lstm_1a_sp
# WER on eval2000(tg)        11.4              11.4
# WER on eval2000(fg)        11.2              11.2
# WER on rt03(tg)            11.1              10.7
# WER on rt03(fg)            10.8              10.5
# Final train prob           -0.091            -0.095
# Final valid prob           -0.091            -0.089
# Final train prob (xent)    -0.990            -0.970
# Final valid prob (xent)    -0.091            -0.9638
# Num-parameters             34976320          39704128

# ./steps/info/chain_dir_info.pl exp/multi_a/chain/tdnn_lstm_1a_sp
# exp/multi_a/chain/tdnn_lstm_1a_sp: num-iters=2096 nj=3..16 num-params=39.7M dim=40+100->6176 combine=-0.088->-0.087 (over 3)
# xent:train/valid[1395,2095,final]=(-1.38,-0.960,-0.970/-1.39,-0.964,-0.964)
# logprob:train/valid[1395,2095,final]=(-0.117,-0.091,-0.095/-0.109,-0.087,-0.089)

# online results
# Eval2000
# %WER 14.2 | 2628 21594 | 87.8 8.6 3.5 2.1 14.2 49.1 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_eval2000/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
# %WER 11.4 | 4459 42989 | 90.3 7.0 2.7 1.7 11.4 46.1 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_eval2000/score_8_0.0/eval2000_hires.ctm.filt.sys
# %WER 8.4 | 1831 21395 | 92.8 5.3 2.0 1.2 8.4 41.2 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_eval2000/score_9_0.0/eval2000_hires.ctm.swbd.filt.sys
# %WER 14.0 | 2628 21594 | 88.0 8.5 3.4 2.1 14.0 48.6 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_eval2000_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
# %WER 11.2 | 4459 42989 | 90.5 6.9 2.6 1.7 11.2 45.4 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_eval2000_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
# %WER 8.1 | 1831 21395 | 93.1 5.1 1.8 1.2 8.1 40.6 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_eval2000_fg/score_9_0.0/eval2000_hires.ctm.swbd.filt.sys

# RT03
# %WER 8.7 | 3970 36721 | 92.2 5.3 2.5 1.0 8.7 37.3 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_rt03/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
# %WER 10.8 | 8420 76157 | 90.4 6.5 3.2 1.2 10.8 40.1 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_rt03/score_8_0.0/rt03_hires.ctm.filt.sys
# %WER 12.7 | 4450 39436 | 88.7 7.7 3.6 1.4 12.7 42.5 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_rt03/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
# %WER 8.5 | 3970 36721 | 92.4 5.1 2.5 0.9 8.5 37.2 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_rt03_fg/score_7_1.0/rt03_hires.ctm.fsh.filt.sys
# %WER 10.5 | 8420 76157 | 90.6 6.3 3.1 1.2 10.5 40.1 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_rt03_fg/score_8_0.0/rt03_hires.ctm.filt.sys
# %WER 12.4 | 4450 39436 | 88.9 7.2 3.9 1.3 12.4 42.7 | exp/multi_a/chain/tdnn_lstm_1a_sp_online/decode_rt03_fg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys

set -e

# configs for 'chain'
stage=-10
train_stage=-10
get_egs_stage=-10
speed_perturb=true
multi=multi_a
gmm=tri5a
decode_iter=
decode_dir_affix=
decode_nj=50

# training options
frames_per_chunk=140,100,160
frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1)
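# e.g. with frames_per_chunk=140,100,160 above, the cut command keeps only the
# first value (140); that single value is what we later pass to the decoding
# scripts as --frames-per-chunk.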
chunk_left_context=40
chunk_right_context=0
xent_regularize=0.025
self_repair_scale=0.00001
label_delay=5
# decode options
extra_left_context=50
extra_right_context=0
dropout_schedule='0,0@0.20,0.3@0.50,0'
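# The schedule above is a comma-separated list of dropout-proportion@data-fraction
# points that are interpolated over training: dropout stays at 0 until 20% of the
# data has been processed, rises to 0.3 at the halfway point, and decays back to 0
# by the end of training.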
num_epochs=4

remove_egs=false
common_egs_dir=

test_online_decoding=true # if true, it will run the last decoding stage.

nnet3_affix=
tdnn_affix=_1a

# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
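# e.g. assuming this script is saved as local/chain/run_tdnn_lstm_1a.sh (the exact
# path may differ in your setup), a re-run that skips those stages could look like:
#   local/chain/run_tdnn_lstm_1a.sh --stage 8 --train-stage -10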

suffix=
if [ "$speed_perturb" == "true" ]; then
  suffix=_sp
fi

dir=exp/$multi/chain/tdnn_lstm${tdnn_affix}${suffix}
train_set=${multi}/${gmm}${suffix}
lats_dir=exp/${multi}/${gmm}_lats_nodup${suffix}
treedir=exp/$multi/chain/${gmm}_tree
lang=data/${multi}/lang_${gmm}_chain
lang_dir=data/lang_${multi}_${gmm}_fsh_sw1_tg
rescore_lang_dir=data/lang_${multi}_${gmm}_fsh_sw1_fg
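# With the defaults above (multi=multi_a, gmm=tri5a, speed_perturb=true,
# tdnn_affix=_1a) these expand to e.g. dir=exp/multi_a/chain/tdnn_lstm_1a_sp,
# lang_dir=data/lang_multi_a_tri5a_fsh_sw1_tg and
# rescore_lang_dir=data/lang_multi_a_tri5a_fsh_sw1_fg.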

local/nnet3/run_ivector_common.sh --stage $stage --nnet3-affix "$nnet3_affix" \
  --multi $multi \
  --gmm $gmm \
  --speed-perturb $speed_perturb || exit 1

online_ivector_dir=exp/$multi/nnet3${nnet3_affix}/ivectors_${train_set}

if [ $stage -le 9 ]; then
  steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \
    --generate-ali-from-lats true data/$train_set \
    data/lang_${multi}_${gmm} exp/${multi}/$gmm $lats_dir
  rm ${lats_dir}/fsts.*.gz # save space
fi

if [ $stage -le 10 ]; then
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file.  [Note: it really has two states; the first one is traversed only
  # once, the second one has zero or more repeats.]
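  # An illustrative sketch of that per-phone topology (not the literal contents
  # of the topo file):
  #   start -> state0 (traversed exactly once) -> state1 (self-loop, repeated
  #   zero or more times) -> end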
  if [ -d $lang ]; then
    echo "$lang exists. Remove it or skip this stage."
    exit 1
  fi

  cp -r data/lang_${multi}_${gmm} $lang
  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
  # Use our special topology... note that we may have to tune this topology
  # later on.
  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi

if [ $stage -le 11 ]; then
  # Build a tree using our new topology.

  if [ -f $treedir/final.mdl ]; then
    echo "$treedir exists. Remove it or skip this stage."
    exit 1
  fi

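  # The 7000 below is the target number of tree leaves; the actual number of
  # pdfs may come out smaller (6176 in the run reported at the top of this file).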
  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$train_cmd" 7000 data/$train_set $lang $lats_dir $treedir
fi

if [ $stage -le 12 ]; then
  echo "$0: creating neural net configs using the xconfig parser";

  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
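  # e.g. with the default xent_regularize=0.025 this evaluates to 0.5/0.025 = 20.0
  # (note that the 'print' statement above assumes a python2 interpreter).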
  lstm_opts="dropout-proportion=0.0 decay-time=40"

  relu_dim=1024
  cell_dim=1024
  projection_dim=256

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=40 name=input

  # please note that it is important to have input layer with the name=input
  # as the layer immediately preceding the fixed-affine-layer to enable
  # the use of short notation for the descriptor
  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-layer name=tdnn1 dim=$relu_dim
  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$relu_dim
  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$relu_dim

  # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
  fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts
  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$relu_dim
  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$relu_dim
  fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts
  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$relu_dim
  relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$relu_dim
  fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts

  ## adding the layers for chain branch
  output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5

  # adding the layers for xent branch
  # This block prints the configs for a separate output that will be
  # trained with a cross-entropy objective in the 'chain' models... this
  # has the effect of regularizing the hidden parts of the model.  we use
  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
  # 0.5 / args.xent_regularize is suitable as it means the xent
  # final-layer learns at a rate independent of the regularization
  # constant; and the 0.5 was tuned so as to make the relative progress
  # similar in the xent and regular final layers.
  output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5

EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi

if [ $stage -le 13 ]; then
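  # On the CLSP grid at JHU this spreads the (large) egs directory over several
  # filesystems via utils/create_split_dir.pl; on other machines the condition
  # below is false and nothing happens.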
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/multi-en-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
  fi

  steps/nnet3/chain/train.py --stage $train_stage \
    --cmd "$decode_cmd" \
    --feat.online-ivector-dir exp/$multi/nnet3${nnet3_affix}/ivectors_${train_set} \
    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient 0.1 \
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --trainer.num-chunk-per-minibatch 64,32 \
    --trainer.frames-per-iter 1500000 \
    --trainer.max-param-change 2.0 \
    --trainer.num-epochs $num_epochs \
    --trainer.optimization.shrink-value 0.99 \
    --trainer.optimization.num-jobs-initial 3 \
    --trainer.optimization.num-jobs-final 16 \
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.dropout-schedule=$dropout_schedule \
    --trainer.optimization.momentum 0.0 \
    --trainer.deriv-truncate-margin 8 \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $frames_per_chunk \
    --egs.chunk-left-context $chunk_left_context \
    --egs.chunk-right-context $chunk_right_context \
    --egs.chunk-left-context-initial 0 \
    --egs.chunk-right-context-final 0 \
    --egs.dir "$common_egs_dir" \
    --cleanup.remove-egs $remove_egs \
    --feat-dir data/${train_set}_hires \
    --tree-dir $treedir \
    --lat-dir $lats_dir \
    --dir $dir || exit 1;
fi

lang_suffix=${lang_dir##*lang}
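# e.g. lang_dir=data/lang_multi_a_tri5a_fsh_sw1_tg gives lang_suffix=_multi_a_tri5a_fsh_sw1_tg;
# this suffix is appended to the graph and decode directory names below.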

if [ $stage -le 14 ]; then
  # Note: it might appear that this $lang directory is mismatched, and it is as
  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
  # the lang directory.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_dir \
    $dir $dir/graph${lang_suffix}
fi

graph_dir=$dir/graph${lang_suffix}
if [ $stage -le 15 ]; then
  iter_opts=
  [ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
  [ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
  if [ ! -z $decode_iter ]; then
    iter_opts=" --iter $decode_iter "
  fi
  for decode_set in eval2000 rt03; do
    (
      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
        --nj 50 --cmd "$decode_cmd" $iter_opts \
        --extra-left-context $extra_left_context \
        --extra-right-context $extra_right_context \
        --extra-left-context-initial 0 \
        --extra-right-context-final 0 \
        --frames-per-chunk "$frames_per_chunk_primary" \
        --online-ivector-dir exp/$multi/nnet3${nnet3_affix}/ivectors_${decode_set} \
        $graph_dir data/${decode_set}_hires \
        $dir/decode${lang_suffix}_${decode_set}${decode_dir_affix:+_$decode_dir_affix}${decode_iter:+_iter$decode_iter}

      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
        $lang_dir $rescore_lang_dir data/${decode_set}_hires \
        $dir/decode${lang_suffix}_${decode_set}${decode_dir_affix:+_$decode_dir_affix}{,_fg}${decode_iter:+_iter$decode_iter} || exit 1;
    ) &
  done
fi
wait;

if $test_online_decoding && [ $stage -le 16 ]; then
  # note: if the features change (e.g. you add pitch features), you will have to
  # change the options of the following command line.
  steps/online/nnet3/prepare_online_decoding.sh \
    --mfcc-config conf/mfcc_hires.conf \
    $lang exp/$multi/nnet3${nnet3_affix}/extractor $dir ${dir}_online

  rm $dir/.error 2>/dev/null || true
  for decode_set in eval2000 rt03; do
    (
      # note: we just give it data/${decode_set}_hires as the data directory;
      # online decoding only uses the wav.scp, so the feature type does not matter.

      steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
        --acwt 1.0 --post-decode-acwt 10.0 \
        $graph_dir data/${decode_set}_hires \
        ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1;
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
        $lang_dir $rescore_lang_dir data/${decode_set}_hires \
        ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}{,_fg} || exit 1;
    ) || touch $dir/.error &
  done
  wait
  if [ -f $dir/.error ]; then
    echo "$0: something went wrong in online decoding"
    exit 1
  fi
fi

exit 0;