From 1279778ca16acd9d62b42c4dab65d0d66c49e754 Mon Sep 17 00:00:00 2001 From: LvHang Date: Sat, 15 Sep 2018 20:42:15 -0400 Subject: [PATCH] Add GruNonlinearityComponent(by Dan) and OutputGruNonlinearityComponent; moving aroun some sources in nnet3 to avoid very large files rename nnet-combined-component.{h,cc} and str case Update get_saturation.pl for fast gru version. Get matched resutls --- .../local/chain/tuning/run_tdnn_opgru_1a.sh | 25 +- .../local/chain/tuning/run_tdnn_opgru_1b.sh | 315 +++ egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py | 1080 +++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 5 + egs/wsj/s5/steps/nnet3/get_saturation.pl | 8 + src/nnet3/Makefile | 2 +- src/nnet3/nnet-combined-component.cc | 2332 +++++++++++++++++ src/nnet3/nnet-combined-component.h | 1109 ++++++++ src/nnet3/nnet-component-itf.cc | 5 + src/nnet3/nnet-simple-component.cc | 1300 +-------- src/nnet3/nnet-simple-component.h | 530 +--- src/nnet3/nnet-test-utils.cc | 18 +- 12 files changed, 4889 insertions(+), 1840 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh create mode 100644 src/nnet3/nnet-combined-component.cc create mode 100644 src/nnet3/nnet-combined-component.h diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh index b1426bc22b7..18d3f81ffde 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -4,31 +4,36 @@ # This is based on TDNN_LSTM_1b, but using the NormOPGRU to replace the LSTMP, # and adding chunk-{left,right}-context-initial=0 +# For the details of OPGRU structure, please check the paper +# "Output-Gate Projected Gated Recurrent Unit for Speech Recognition" +# by Gaofeng Cheng et al, +# http://www.danielpovey.com/files/2018_interspeech_opgru.pdf + # Different from the vanilla OPGRU, Norm-OPGRU adds batchnorm in its output (forward direction) # and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar # results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). # ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_opgru_1a_sp # System tdnn_lstm_1e_sp tdnn_opgru_1a_sp -# WER on train_dev(tg) 12.81 12.39 -# [looped:] 12.93 12.32 -# WER on train_dev(fg) 11.92 11.39 -# [looped:] 12.07 11.35 +# WER on train_dev(tg) 12.81 12.31 +# [looped:] 12.93 12.26 +# WER on train_dev(fg) 11.92 11.60 +# [looped:] 12.07 11.65 # WER on eval2000(tg) 15.6 15.1 # [looped:] 16.0 15.1 -# WER on eval2000(fg) 14.1 13.6 +# WER on eval2000(fg) 14.1 13.5 # [looped:] 14.5 13.5 -# Final train prob -0.065 -0.066 -# Final valid prob -0.087 -0.085 -# Final train prob (xent) -0.918 -0.889 -# Final valid prob (xent) -1.0309 -0.9837 +# Final train prob -0.065 -0.068 +# Final valid prob -0.087 -0.091 +# Final train prob (xent) -0.918 -0.879 +# Final valid prob (xent) -1.0309 -0.9667 set -e # configs for 'chain' -stage=12 +stage=0 train_stage=-10 get_egs_stage=-10 speed_perturb=true diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh new file mode 100755 index 00000000000..579008b5658 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -0,0 +1,315 @@ +#!/bin/bash +# Apache 2.0 + +# This is based on TDNN_OPGRU_1A, but using the FastNormOPGRU to replace the NormPGRU. 
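+
+# "FastNormOPGRU" refers to the fast-norm-opgru-layer xconfig layer, which implements the
+# core GRU recurrence with the fused components added in this patch
+# (GruNonlinearityComponent / OutputGruNonlinearityComponent; see
+# src/nnet3/nnet-combined-component.{h,cc}), in place of the norm-opgru-layer used in
+# run_tdnn_opgru_1a.sh.  It is configured in stage 12 below, e.g.:
+#   fast-norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts
+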
+# For the details of OPGRU structure, please check the paper +# "Output-Gate Projected Gated Recurrent Unit for Speech Recognition" +# by Gaofeng Cheng et al, +# http://www.danielpovey.com/files/2018_interspeech_opgru.pdf + +# Different from the vanilla OPGRU, Norm-OPGRU adds batchnorm in its output (forward direction) +# and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar +# results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). + +# ./local/chain/compare_wer_general.sh --looped tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# System tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# WER on train_dev(tg) 12.31 12.41 +# [looped:] 12.26 12.38 +# WER on train_dev(fg) 11.49 11.60 +# [looped:] 11.43 11.65 +# WER on eval2000(tg) 14.9 15.1 +# [looped:] 15.0 15.1 +# WER on eval2000(fg) 13.5 13.7 +# [looped:] 13.5 13.7 +# Final train prob -0.068 -0.070 +# Final valid prob -0.091 -0.092 +# Final train prob (xent) -0.879 -0.889 +# Final valid prob (xent) -0.9667 -0.9723 + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_opgru_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.2@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= +test_online_decoding= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0 gru-nonlinearity-options=\"max-change=0.75\"" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/gru.py for the other options and defaults + fast-norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +wait; +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py index 530ba14474a..2f387a6a1e5 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py @@ -1,6 +1,7 @@ # Copyright 2016 Johns Hopkins University (Dan Povey) # 2017 Gaofeng Cheng (UCAS) # 2017 Lu Huang (THU) +# 2018 Hang Lyu # Apache 2.0. 
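The accompanying five-line change to egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py (listed in the diffstat but not shown in this excerpt) registers the new layer names so the xconfig parser can instantiate the classes below. A minimal sketch of what those entries would look like, assuming they follow parser.py's existing config_to_layer convention (the exact hunk is not reproduced here):

    # Sketch only: assumes parser.py's existing name-to-class mapping.
    import libs.nnet3.xconfig.layers as xlayers

    config_to_layer = {
        # ... existing entries ('input', 'relu-batchnorm-layer', 'norm-opgru-layer', etc.) ...
        'fast-gru-layer'        : xlayers.XconfigFastGruLayer,
        'fast-pgru-layer'       : xlayers.XconfigFastPgruLayer,
        'fast-norm-pgru-layer'  : xlayers.XconfigFastNormPgruLayer,
        'fast-opgru-layer'      : xlayers.XconfigFastOpgruLayer,
        'fast-norm-opgru-layer' : xlayers.XconfigFastNormOpgruLayer,
    }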
@@ -83,7 +84,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the LSTM config + # convenience function to generate the GRU config def generate_gru_config(self): # assign some variables to reduce verbosity @@ -468,7 +469,7 @@ def output_name(self, auxiliary_output = None): def output_dim(self, auxiliary_output = None): if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): - if node_name == 'c_t': + if node_name == 'h_t': return self.config['cell-dim'] # add code for other auxiliary_outputs here when we decide to expose them else: @@ -487,7 +488,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the PGRU config + # convenience function to generate the Norm-PGRU config def generate_pgru_config(self): # assign some variables to reduce verbosity @@ -711,7 +712,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the PGRU config + # convenience function to generate the OPGRU config def generate_pgru_config(self): # assign some variables to reduce verbosity @@ -922,7 +923,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the PGRU config + # convenience function to generate the Norm-OPGRU config def generate_pgru_config(self): # assign some variables to reduce verbosity @@ -1039,3 +1040,1072 @@ def generate_pgru_config(self): configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip_renorm".format(name)) return configs + +# This class is for lines like +# 'fast-gru-layer name=gru1 input=[-1] delay=-3' +# It generates an GRU sub-graph without output projections. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# decay-time is deprecated under GRU or PGRU, as I found the PGRUs do not need the decay-time option to get generalized to unseen sequence length +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the GRU/LSTM ] +# clipping-threshold=30 [similar to LSTMs ,nnet3 GRUs use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
+# self-repair-scale-nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU/LSTM ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU/LSTM, can be used to do things like set biases to initialize to 1] +class XconfigFastGruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-gru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. + # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75' + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.descriptors['input']['dim'] + + def check_configs(self): + key = 'cell-dim' + if self.config['cell-dim'] <= 0: + raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + return self.config['cell-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_gru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the GRU config + def generate_gru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + delay = self.config['delay'] + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], abs(delay))) + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + affine_str = self.config['ng-affine-options'] + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate + # r_t = \sigmoid ( U^r x_t + W^r y_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h ( y_{t-1} \dot r_t ) ) + # y_t = ( 1 - z_t ) \dot h_t + z_t \dot y_{t-1} + # Note: + # naming convention: + # .W_. e.g. Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_r* matrices") + configs.append("component name={0}.W_r.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + + configs.append("# hpart_t related matrix : W_hpart matrice") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities for z_t and r_t") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append("# r_t") + configs.append("component-node name={0}.r_t_pre component={0}.W_r.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# y_t") + 
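+        # (For reference: in this non-projected layer the GruNonlinearityComponent consumes
+        # Append(z_t, r_t, hpart_t, y_{t-1}), i.e. an input of dimension 4 * cell-dim, and
+        # produces (h_t, c_t) of dimension 2 * cell-dim; the dim-range-node below selects
+        # the second half, c_t, which for the non-projected GRU equals the output y_t.)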
configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we just get the second half. Otherwise, in non-projection gru layer, y_t = c_t") + configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) + configs.append("dim-range-node name={0}.y_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# s_t : recurrence") + configs.append("# Note: in non-projection gru layer, the recurrent part equals the output, namely y_t.") + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name)) + return configs + + +# This class is for lines like +# 'fast-pgru-layer name=pgru1 input=[-1] delay=-3' +# It generates an PGRU sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigGruLayer for this +# simple RNN. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] +# delay=-1 [Delay in the recurrent connections of the GRU ] +# clipping-threshold=30 [nnet3 GRU use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
+# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] +class XconfigFastPgruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-pgru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. + # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75' + } + + def set_derived_configs(self): + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim'] + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise RuntimeError("{0} has invalid value {1}.".format( + key, self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise RuntimeError("recurrent+non-recurrent projection dim exceeds " + "cell dim.") + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {2}." 
+ .format(self.layer_type, key, + self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_pgru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the PGRU config + def generate_pgru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate + # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) ) + # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + # y_t = W^y c_t # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the GRU. + # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t + # dim(s_t) = recurrent_dim. + # Note: + # naming convention: + # .W_. e.g. 
Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_r* matrices") + configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) + + + configs.append("# hpart_t related matrix : W_hpart matric") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t and r_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# c_t") + configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.") + configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) + configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# the projected matrix W_y.c and y_t") + configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t component={0}.W_y.c input={0}.c_t".format(name)) + + configs.append("# s_t : recurrence") + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_pre".format(name)) + return configs + + +# This 
class is for lines like +# 'fast-norm-pgru-layer name=pgru1 input=[-1] delay=-3' + +# Different from the vanilla PGRU, the NormPGRU uses batchnorm in the forward direction +# and renorm in the recurrence. + +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] +# delay=-1 [Delay in the recurrent connections of the GRU ] +# clipping-threshold=30 [nnet3 GRU use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] +class XconfigFastNormPgruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-norm-pgru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. 
+ # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75', + 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added + 'dropout-per-frame' : True # If False, regular dropout, not per frame + } + + def set_derived_configs(self): + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim'] + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise RuntimeError("{0} has invalid value {1}.".format( + key, self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise RuntimeError("recurrent+non-recurrent projection dim exceeds " + "cell dim.") + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {2}." + .format(self.layer_type, key, + self.config[key])) + if ((self.config['dropout-proportion'] > 1.0 or + self.config['dropout-proportion'] < 0.0) and + self.config['dropout-proportion'] != -1.0 ): + raise RuntimeError("dropout-proportion has invalid value {0}." + .format(self.config['dropout-proportion'])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_pgru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the Norm-PGRU config + def generate_pgru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + dropout_proportion = self.config['dropout-proportion'] + dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate + # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) ) + # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + # y_t_tmp = W^y c_t + # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] ) # dim(s_t) = recurrent_dim. + # y_t = batchnorm ( y_t_tmp ) # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the GRU. + # Note: + # naming convention: + # .W_. e.g. 
Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_r* matrices") + configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) + + + configs.append("# hpart_t related matrix : W_hpart matric") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) + + if dropout_proportion != -1.0: + configs.append("# Defining the dropout component") + configs.append("component name={0}.dropout_z type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}" + .format(name, cell_dim, dropout_proportion, dropout_per_frame)) + configs.append("component name={0}.dropout_r type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}" + .format(name, rec_proj_dim, dropout_proportion, dropout_per_frame)) + + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + if dropout_proportion != -1.0: + configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name)) + configs.append("component-node name={0}.z_t component={0}.dropout_z input={0}.z_t_predrop".format(name)) + else: + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + + configs.append("# r_t") + configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + if dropout_proportion != -1.0: + configs.append("component-node name={0}.r_t_predrop component={0}.r input={0}.r_t_pre".format(name)) + configs.append("component-node name={0}.r_t component={0}.dropout_r input={0}.r_t_predrop".format(name)) + else: + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# c_t") + configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.") + configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin 
input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) + configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# the projected matrix W_y.c and y_t_tmp") + configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t_tmp component={0}.W_y.c input={0}.c_t".format(name)) + + configs.append("# s_t : recurrence") + configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name)) + + configs.append("# y_t : output") + configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name)) + return configs + + +# This class is for lines like +# 'fast-opgru-layer name=opgru1 input=[-1] delay=-3' +# It generates an PGRU sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigGruLayer for this +# simple RNN. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] +# delay=-1 [Delay in the recurrent connections of the GRU ] +# clipping-threshold=30 [nnet3 GRU use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
+# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] +class XconfigFastOpgruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-opgru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. + # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75' + } + + def set_derived_configs(self): + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim'] + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise RuntimeError("{0} has invalid value {1}.".format( + key, self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise RuntimeError("recurrent+non-recurrent projection dim exceeds " + "cell dim.") + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {2}." 
+ .format(self.layer_type, key, + self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_pgru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the OPGRU config + def generate_pgru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate + # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) + # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + # y_t = ( c_t \dot o_t ) W^y # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the GRU. + # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t + # dim(s_t) = recurrent_dim. + # Note: + # naming convention: + # .W_. e.g. 
Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_o* matrices") + configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + + + configs.append("# hpart_t related matrix : W_hpart matric") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t and o_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# c_t") + configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.") + configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay)) + configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# the projected matrix W_y.cdoto and y_t") + configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name)) + configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t component={0}.W_y.cdoto input={0}.cdoto".format(name)) + + configs.append("# s_t recurrence") + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_preclip 
+# This class is for lines like
+# 'fast-norm-opgru-layer name=opgru1 input=[-1] delay=-3'
+
+# Unlike the vanilla OPGRU, the NormOPGRU uses batchnorm in the forward direction
+# and renorm in the recurrence.
+
+# The cell dimension must be specified via 'cell-dim=xxx'; the output dimension of
+# the layer is recurrent-projection-dim + non-recurrent-projection-dim.
+# See other configuration values below.
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'                 [Descriptor giving the input of the layer.]
+#   cell-dim=-1                  [Dimension of the cell]
+#   recurrent-projection-dim     [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
+#   non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections,
+#                                 in addition to recurrent-projection-dim, e.g. cell-dim/4]
+#   delay=-1                     [Delay in the recurrent connections of the GRU]
+#   clipping-threshold=30        [nnet3 GRUs use a gradient clipping component at the recurrent connections.
+#                                 This is the threshold used to decide if clipping has to be activated]
+#   zeroing-interval=20          [interval at which we (possibly) zero out the recurrent derivatives.]
+#   zeroing-threshold=15         [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
+#   self-repair-scale-nonlinearity=1e-5  [A constant scaling the self-repair vector computed in derived
+#                                 classes of NonlinearComponent, i.e. SigmoidComponent, TanhComponent
+#                                 and RectifiedLinearComponent]
+#   ng-per-element-scale-options=''  [Additional options used for the diagonal matrices in the GRU]
+#   gru-nonlinearity-options=' max-change=0.75'  [options passed to the OutputGruNonlinearityComponent; see below for detail]
+#   ng-affine-options=''         [Additional options used for the full matrices in the GRU, can be used
+#                                 to do things like set biases to initialize to 1]
+class XconfigFastNormOpgruLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == "fast-norm-opgru-layer"
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input' : '[-1]',
+                       'cell-dim' : -1, # this is a compulsory argument
+                       'recurrent-projection-dim' : -1, # defaults to cell-dim / 4
+                       'non-recurrent-projection-dim' : -1, # defaults to
+                                                            # recurrent-projection-dim
+                       'clipping-threshold' : 30.0,
+                       'delay' : -1,
+                       'ng-per-element-scale-options' : ' max-change=0.75 ',
+                       'ng-affine-options' : ' max-change=0.75 ',
+                       'self-repair-scale-nonlinearity' : 0.00001,
+                       'zeroing-interval' : 20,
+                       'zeroing-threshold' : 15.0,
+                       # If you want to set 'self-repair-scale', 'self-repair-threshold'
+                       # or 'param-stddev' for the OutputGruNonlinearityComponent
+                       # (by default they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim),
+                       # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
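+                       # For example (an illustrative override, not a default):
+                       #   gru-nonlinearity-options=' max-change=0.75 self-repair-scale=2.0e-05'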
+                       # You can also see src/nnet3/nnet-combined-component.h for detail.
+                       'gru-nonlinearity-options' : ' max-change=0.75',
+                       'dropout-proportion' : -1.0,  # If -1.0, no dropout components will be added
+                       'dropout-per-frame' : True  # If False, regular dropout, not per frame
+                       }
+
+    def set_derived_configs(self):
+        if self.config['recurrent-projection-dim'] <= 0:
+            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
+
+        if self.config['non-recurrent-projection-dim'] <= 0:
+            self.config['non-recurrent-projection-dim'] = \
+               self.config['recurrent-projection-dim']
+
+    def check_configs(self):
+        for key in ['cell-dim', 'recurrent-projection-dim',
+                    'non-recurrent-projection-dim']:
+            if self.config[key] <= 0:
+                raise RuntimeError("{0} has invalid value {1}.".format(
+                    key, self.config[key]))
+
+        if self.config['delay'] == 0:
+            raise RuntimeError("delay cannot be zero")
+
+        if (self.config['recurrent-projection-dim'] +
+            self.config['non-recurrent-projection-dim'] >
+            self.config['cell-dim']):
+            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
+                               "cell dim.")
+        for key in ['self-repair-scale-nonlinearity']:
+            if self.config[key] < 0.0 or self.config[key] > 1.0:
+                raise RuntimeError("In layer of type {0}, {1} has invalid value {2}."
+                                   .format(self.layer_type, key,
+                                           self.config[key]))
+        if ((self.config['dropout-proportion'] > 1.0 or
+             self.config['dropout-proportion'] < 0.0) and
+            self.config['dropout-proportion'] != -1.0):
+            raise RuntimeError("dropout-proportion has invalid value {0}."
+                               .format(self.config['dropout-proportion']))
+
+    def auxiliary_outputs(self):
+        return ['c_t']
+
+    def output_name(self, auxiliary_output = None):
+        node_name = 'y_t'
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                node_name = auxiliary_output
+            else:
+                raise Exception("In layer '{0}' of type {1}, unknown auxiliary output name {2}"
+                                "".format(self.name, self.layer_type, auxiliary_output))
+
+        return '{0}.{1}'.format(self.name, node_name)
+
+    def output_dim(self, auxiliary_output = None):
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise Exception("In layer '{0}' of type {1}, unknown auxiliary output name {2}"
+                                "".format(self.name, self.layer_type, auxiliary_output))
+
+        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self.generate_pgru_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user-specified matrices in GRU initialization,
+                # so the 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
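+    # Note: each entry of the list returned by get_full_config() is a
+    # (config-name, config-line) pair, e.g. (illustrative only):
+    #   ('final', "component name=opgru1.W_z.xs type=NaturalGradientAffineComponent ...")
+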
+    # convenience function to generate the Norm-OPGRU config
+    def generate_pgru_config(self):
+
+        # assign some variables to reduce verbosity
+        name = self.name
+        # in the below code we will just refer to the descriptor_strings as 'descriptors' for conciseness
+        input_dim = self.descriptors['input']['dim']
+        input_descriptor = self.descriptors['input']['final-string']
+        cell_dim = self.config['cell-dim']
+        rec_proj_dim = self.config['recurrent-projection-dim']
+        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
+        delay = self.config['delay']
+        repair_nonlin = self.config['self-repair-scale-nonlinearity']
+        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
+        bptrunc_str = ("clipping-threshold={0}"
+                       " zeroing-threshold={1}"
+                       " zeroing-interval={2}"
+                       " recurrence-interval={3}"
+                       "".format(self.config['clipping-threshold'],
+                                 self.config['zeroing-threshold'],
+                                 self.config['zeroing-interval'],
+                                 abs(delay)))
+        affine_str = self.config['ng-affine-options']
+        pes_str = self.config['ng-per-element-scale-options']
+        dropout_proportion = self.config['dropout-proportion']
+        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'
+
+        # Natural gradient per element scale parameters
+        # TODO: decide if we want to keep exposing these options
+        if re.search('param-mean', pes_str) is None and \
+           re.search('param-stddev', pes_str) is None:
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
+
+        # options string for the OutputGruNonlinearityComponent
+        gru_nonlin_str = self.config['gru-nonlinearity-options']
+
+        # The formulation is:
+        # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} )    # update gate
+        # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} )    # output gate
+        # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} )
+        # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1}
+        # y_t_tmp = ( c_t \dot o_t ) W^y
+        # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] )  # dim(s_t) = recurrent_dim.
+        # y_t = batchnorm ( y_t_tmp )                 # dim(y_t) = recurrent_dim + non_recurrent_dim.
+        #                                               This is the output of the GRU.
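+        # The only differences from the plain OPGRU recurrence above are the renorm
+        # applied to s_t and the batchnorm applied to y_t (hence "Norm-OPGRU").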
+        # Note:
+        #   naming convention:
+        #     <layer-name>.W_<outputname>.<inputname> e.g. Gru1.W_i.xr for matrix
+        #     providing output to gate i and operating on an appended vector [x,r]
+        #   notation convention:
+        #     In order to be consistent with the notation used in
+        #     nnet-combined-component.cc, we map the "\tilde{h_t}" and "h_t" used in
+        #     the paper to "h_t" and "c_t" respectively.
+
+        configs = []
+        configs.append("### Begin Gru layer '{0}'".format(name))
+        configs.append("# Update gate control : W_z* matrices")
+        configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
+        configs.append("# Output gate control : W_o* matrices")
+        configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
+
+        configs.append("# hpart_t related matrix : W_hpart matrix")
+        configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim, affine_str))
+
+        configs.append("# Defining the non-linearities")
+        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+
+        if dropout_proportion != -1.0:
+            configs.append("# Defining the dropout component")
+            configs.append("component name={0}.dropout type=DropoutComponent dim={1} "
+                           "dropout-proportion={2} dropout-per-frame={3}"
+                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))
+
+        recurrent_connection = '{0}.s_t'.format(name)
+
+        configs.append("# z_t")
+        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
+        if dropout_proportion != -1.0:
+            configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name))
+            configs.append("component-node name={0}.z_t component={0}.dropout input={0}.z_t_predrop".format(name))
+        else:
+            configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))
+
+        configs.append("# o_t")
+        configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
+        if dropout_proportion != -1.0:
+            configs.append("component-node name={0}.o_t_predrop component={0}.o input={0}.o_t_pre".format(name))
+            configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name))
+        else:
+            configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name))
+
+        configs.append("# hpart_t")
+        configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor))
+
+        configs.append("# c_t")
+        configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.")
+        configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str))
+        configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay))
+        configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim))
+
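+        # 'cdoto' below denotes the elementwise product c_t \dot o_t from the
+        # formulation above; it is computed by an ElementwiseProductComponent.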
matrix W_y.cdoto and y_t_tmp") + configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name)) + configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t_tmp component={0}.W_y.cdoto input={0}.cdoto".format(name)) + + configs.append("# s_t : recurrence") + configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name)) + + configs.append("# y_t : output") + configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 01c1b1e533c..ca1f7d8372f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,6 +68,11 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, + 'fast-gru-layer' : xlayers.XconfigFastGruLayer, + 'fast-pgru-layer' : xlayers.XconfigFastPgruLayer, + 'fast-norm-pgru-layer' : xlayers.XconfigFastNormPgruLayer, + 'fast-opgru-layer' : xlayers.XconfigFastOpgruLayer, + 'fast-norm-opgru-layer' : xlayers.XconfigFastNormOpgruLayer, 'tdnnf-layer': xlayers.XconfigTdnnfLayer, 'prefinal-layer': xlayers.XconfigPrefinalLayer, 'renorm-component': xlayers.XconfigRenormComponent, diff --git a/egs/wsj/s5/steps/nnet3/get_saturation.pl b/egs/wsj/s5/steps/nnet3/get_saturation.pl index ed18fc1c399..979736f0847 100755 --- a/egs/wsj/s5/steps/nnet3/get_saturation.pl +++ b/egs/wsj/s5/steps/nnet3/get_saturation.pl @@ -74,6 +74,14 @@ if (! 
$ok) { print STDERR "Could not parse at least one of the avg-deriv values in the following info line: $_"; } + } elsif (m/type=.*GruNonlinearityComponent/) { + if (m/deriv-avg=[^m]+mean=([^,]+),/) { + $num_nonlinearities += 1; + my $this_saturation = 1.0 - ($1 / 1.0); + $total_saturation += $this_saturation; + } else { + print STDERR "$0: could not make sense of line (no deriv-avg?): $_"; + } } } diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 135853cadc3..6214592303b 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -15,7 +15,7 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-common-test convolution-test attention-test OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ - nnet-simple-component.o nnet-normalize-component.o \ + nnet-simple-component.o nnet-combined-component.o nnet-normalize-component.o \ nnet-general-component.o nnet-parse.o natural-gradient-online.o \ nnet-descriptor.o nnet-optimize.o nnet-computation.o \ nnet-computation-graph.o nnet-graph.o am-nnet-simple.o \ diff --git a/src/nnet3/nnet-combined-component.cc b/src/nnet3/nnet-combined-component.cc new file mode 100644 index 00000000000..0a2fb3f5a91 --- /dev/null +++ b/src/nnet3/nnet-combined-component.cc @@ -0,0 +1,2332 @@ +// nnet3/nnet-combined-component.cc + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) +// 2015 Daniel Galvez +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "nnet3/nnet-combined-component.h" +#include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" + +namespace kaldi { +namespace nnet3 { + +// Constructors for the convolution component +ConvolutionComponent::ConvolutionComponent(): + UpdatableComponent(), + input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), + filt_x_dim_(0), filt_y_dim_(0), + filt_x_step_(0), filt_y_step_(0), + input_vectorization_(kZyx) { } + +ConvolutionComponent::ConvolutionComponent( + const ConvolutionComponent &component): + UpdatableComponent(component), + input_x_dim_(component.input_x_dim_), + input_y_dim_(component.input_y_dim_), + input_z_dim_(component.input_z_dim_), + filt_x_dim_(component.filt_x_dim_), + filt_y_dim_(component.filt_y_dim_), + filt_x_step_(component.filt_x_step_), + filt_y_step_(component.filt_y_step_), + input_vectorization_(component.input_vectorization_), + filter_params_(component.filter_params_), + bias_params_(component.bias_params_) { } + +ConvolutionComponent::ConvolutionComponent( + const CuMatrixBase &filter_params, + const CuVectorBase &bias_params, + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + BaseFloat learning_rate): + input_x_dim_(input_x_dim), + input_y_dim_(input_y_dim), + input_z_dim_(input_z_dim), + filt_x_dim_(filt_x_dim), + filt_y_dim_(filt_y_dim), + filt_x_step_(filt_x_step), + filt_y_step_(filt_y_step), + input_vectorization_(input_vectorization), + filter_params_(filter_params), + bias_params_(bias_params){ + KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() && + bias_params.Dim() != 0); + KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim); + SetUnderlyingLearningRate(learning_rate); + is_gradient_ = false; +} + +// aquire input dim +int32 ConvolutionComponent::InputDim() const { + return input_x_dim_ * input_y_dim_ * input_z_dim_; +} + +// aquire output dim +int32 ConvolutionComponent::OutputDim() const { + int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); + int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); + int32 num_filters = filter_params_.NumRows(); + return num_x_steps * num_y_steps * num_filters; +} + +// initialize the component using hyperparameters +void ConvolutionComponent::Init( + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, int32 num_filters, + TensorVectorizationType input_vectorization, + BaseFloat param_stddev, BaseFloat bias_stddev) { + input_x_dim_ = input_x_dim; + input_y_dim_ = input_y_dim; + input_z_dim_ = input_z_dim; + filt_x_dim_ = filt_x_dim; + filt_y_dim_ = filt_y_dim; + filt_x_step_ = filt_x_step; + filt_y_step_ = filt_y_step; + input_vectorization_ = input_vectorization; + KALDI_ASSERT((input_x_dim_ - filt_x_dim_) % filt_x_step_ == 0); + KALDI_ASSERT((input_y_dim_ - filt_y_dim_) % filt_y_step_ == 0); + int32 filter_dim = filt_x_dim_ * filt_y_dim_ * input_z_dim_; + filter_params_.Resize(num_filters, filter_dim); + bias_params_.Resize(num_filters); + KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0); + filter_params_.SetRandn(); + filter_params_.Scale(param_stddev); + bias_params_.SetRandn(); + bias_params_.Scale(bias_stddev); +} + +// initialize the component using predefined matrix file +void ConvolutionComponent::Init( + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + 
int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + std::string matrix_filename) { + input_x_dim_ = input_x_dim; + input_y_dim_ = input_y_dim; + input_z_dim_ = input_z_dim; + filt_x_dim_ = filt_x_dim; + filt_y_dim_ = filt_y_dim; + filt_x_step_ = filt_x_step; + filt_y_step_ = filt_y_step; + input_vectorization_ = input_vectorization; + CuMatrix mat; + ReadKaldiObject(matrix_filename, &mat); + int32 filter_dim = (filt_x_dim_ * filt_y_dim_ * input_z_dim_); + int32 num_filters = mat.NumRows(); + KALDI_ASSERT(mat.NumCols() == (filter_dim + 1)); + filter_params_.Resize(num_filters, filter_dim); + bias_params_.Resize(num_filters); + filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim)); + bias_params_.CopyColFromMat(mat, filter_dim); +} + +// display information about component +std::string ConvolutionComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", input-x-dim=" << input_x_dim_ + << ", input-y-dim=" << input_y_dim_ + << ", input-z-dim=" << input_z_dim_ + << ", filt-x-dim=" << filt_x_dim_ + << ", filt-y-dim=" << filt_y_dim_ + << ", filt-x-step=" << filt_x_step_ + << ", filt-y-step=" << filt_y_step_ + << ", input-vectorization=" << input_vectorization_ + << ", num-filters=" << filter_params_.NumRows(); + PrintParameterStats(stream, "filter-params", filter_params_); + PrintParameterStats(stream, "bias-params", bias_params_, true); + return stream.str(); +} + +// initialize the component using configuration file +void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) { + bool ok = true; + std::string matrix_filename; + int32 input_x_dim = -1, input_y_dim = -1, input_z_dim = -1, + filt_x_dim = -1, filt_y_dim = -1, + filt_x_step = -1, filt_y_step = -1, + num_filters = -1; + std::string input_vectorization_order = "zyx"; + InitLearningRatesFromConfig(cfl); + ok = ok && cfl->GetValue("input-x-dim", &input_x_dim); + ok = ok && cfl->GetValue("input-y-dim", &input_y_dim); + ok = ok && cfl->GetValue("input-z-dim", &input_z_dim); + ok = ok && cfl->GetValue("filt-x-dim", &filt_x_dim); + ok = ok && cfl->GetValue("filt-y-dim", &filt_y_dim); + ok = ok && cfl->GetValue("filt-x-step", &filt_x_step); + ok = ok && cfl->GetValue("filt-y-step", &filt_y_step); + + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + // optional argument + TensorVectorizationType input_vectorization; + cfl->GetValue("input-vectorization-order", &input_vectorization_order); + if (input_vectorization_order.compare("zyx") == 0) { + input_vectorization = kZyx; + } else if (input_vectorization_order.compare("yzx") == 0) { + input_vectorization = kYzx; + } else { + KALDI_ERR << "Unknown or unsupported input vectorization order " + << input_vectorization_order + << " accepted candidates are 'yzx' and 'zyx'"; + } + + if (cfl->GetValue("matrix", &matrix_filename)) { + // initialize from prefined parameter matrix + Init(input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + input_vectorization, + matrix_filename); + } else { + ok = ok && cfl->GetValue("num-filters", &num_filters); + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + // initialize from configuration + int32 filter_input_dim = filt_x_dim * filt_y_dim * input_z_dim; + BaseFloat param_stddev = 1.0 / std::sqrt(filter_input_dim), bias_stddev = 1.0; + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("bias-stddev", &bias_stddev); + Init(input_x_dim, input_y_dim, 
input_z_dim, + filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, num_filters, + input_vectorization, param_stddev, bias_stddev); + } + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); +} + +// Inline methods to convert from tensor index i.e., (x,y,z) index +// to index in yzx or zyx vectorized tensors +inline int32 YzxVectorIndex(int32 x, int32 y, int32 z, + int32 input_x_dim, + int32 input_y_dim, + int32 input_z_dim) { + KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); + return (input_y_dim * input_z_dim) * x + (input_y_dim) * z + y; +} + +inline int32 ZyxVectorIndex(int32 x, int32 y, int32 z, + int32 input_x_dim, + int32 input_y_dim, + int32 input_z_dim) { + KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); + return (input_y_dim * input_z_dim) * x + (input_z_dim) * y + z; +} + +// Method to convert from a matrix representing a minibatch of vectorized +// 3D tensors to patches for convolution, each patch corresponds to +// one dot product in the convolution +void ConvolutionComponent::InputToInputPatches( + const CuMatrixBase& in, + CuMatrix *patches) const{ + int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); + int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); + const int32 filt_x_step = filt_x_step_, + filt_y_step = filt_y_step_, + filt_x_dim = filt_x_dim_, + filt_y_dim = filt_y_dim_, + input_x_dim = input_x_dim_, + input_y_dim = input_y_dim_, + input_z_dim = input_z_dim_, + filter_dim = filter_params_.NumCols(); + + std::vector column_map(patches->NumCols()); + int32 column_map_size = column_map.size(); + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + int32 patch_start_index = patch_number * filter_dim; + for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { + for (int32 y = 0; y < filt_y_dim; y++) { + for (int32 z = 0; z < input_z_dim; z++, index++) { + KALDI_ASSERT(index < column_map_size); + if (input_vectorization_ == kZyx) { + column_map[index] = ZyxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } else if (input_vectorization_ == kYzx) { + column_map[index] = YzxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } + } + } + } + } + } + CuArray cu_cols(column_map); + patches->CopyCols(in, cu_cols); +} + + +// propagation function +// see function declaration in nnet-simple-component.h for details +void* ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + num_filters = filter_params_.NumRows(), + num_frames = in.NumRows(), + filter_dim = filter_params_.NumCols(); + KALDI_ASSERT((*out).NumRows() == num_frames && + (*out).NumCols() == (num_filters * num_x_steps * num_y_steps)); + + CuMatrix patches(num_frames, + num_x_steps * num_y_steps * filter_dim, + kUndefined); + InputToInputPatches(in, &patches); + CuSubMatrix* filter_params_elem = new CuSubMatrix( + filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); + std::vector* > tgt_batch, 
patch_batch, + filter_params_batch; + + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + tgt_batch.push_back(new CuSubMatrix( + out->ColRange(patch_number * num_filters, num_filters))); + patch_batch.push_back(new CuSubMatrix( + patches.ColRange(patch_number * filter_dim, filter_dim))); + filter_params_batch.push_back(filter_params_elem); + tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias + } + } + // apply all filters + AddMatMatBatched(1.0, tgt_batch, patch_batch, + kNoTrans, filter_params_batch, + kTrans, 1.0); + // release memory + delete filter_params_elem; + for (int32 p = 0; p < tgt_batch.size(); p++) { + delete tgt_batch[p]; + delete patch_batch[p]; + } + return NULL; +} + +// scale the parameters +void ConvolutionComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + filter_params_.SetZero(); + bias_params_.SetZero(); + } else { + filter_params_.Scale(scale); + bias_params_.Scale(scale); + } +} + +// add another convolution component +void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) { + const ConvolutionComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + filter_params_.AddMat(alpha, other->filter_params_); + bias_params_.AddVec(alpha, other->bias_params_); +} + +/* + This function transforms a vector of lists into a list of vectors, + padded with -1. + @param[in] The input vector of lists. Let in.size() be D, and let + the longest list length (i.e. the max of in[i].size()) be L. + @param[out] The output list of vectors. The length of the list will + be L, each vector-dimension will be D (i.e. out[i].size() == D), + and if in[i] == j, then for some k we will have that + out[k][j] = i. The output vectors are padded with -1 + where necessary if not all the input lists have the same side. 
+*/ +void RearrangeIndexes(const std::vector > &in, + std::vector > *out) { + int32 D = in.size(); + int32 L = 0; + for (int32 i = 0; i < D; i++) + if (in[i].size() > L) + L = in[i].size(); + out->resize(L); + for (int32 i = 0; i < L; i++) + (*out)[i].resize(D, -1); + for (int32 i = 0; i < D; i++) { + for (int32 j = 0; j < in[i].size(); j++) { + (*out)[j][i] = in[i][j]; + } + } +} + +// Method to compute the input derivative matrix from the input derivatives +// for patches, where each patch corresponds to one dot product +// in the convolution +void ConvolutionComponent::InderivPatchesToInderiv( + const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const { + + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + filt_x_step = filt_x_step_, + filt_y_step = filt_y_step_, + filt_x_dim = filt_x_dim_, + filt_y_dim = filt_y_dim_, + input_x_dim = input_x_dim_, + input_y_dim = input_y_dim_, + input_z_dim = input_z_dim_, + filter_dim = filter_params_.NumCols(); + + // Compute the reverse column_map from the matrix with input + // derivative patches to input derivative matrix + std::vector > reverse_column_map(in_deriv->NumCols()); + int32 rev_col_map_size = reverse_column_map.size(); + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + int32 patch_start_index = patch_number * filter_dim; + for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { + for (int32 y = 0; y < filt_y_dim; y++) { + for (int32 z = 0; z < input_z_dim; z++, index++) { + int32 vector_index; + if (input_vectorization_ == kZyx) { + vector_index = ZyxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } else { + KALDI_ASSERT(input_vectorization_ == kYzx); + vector_index = YzxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } + KALDI_ASSERT(vector_index < rev_col_map_size); + reverse_column_map[vector_index].push_back(index); + } + } + } + } + } + std::vector > rearranged_column_map; + RearrangeIndexes(reverse_column_map, &rearranged_column_map); + for (int32 p = 0; p < rearranged_column_map.size(); p++) { + CuArray cu_cols(rearranged_column_map[p]); + in_deriv->AddCols(in_deriv_patches, cu_cols); + } +} + +// back propagation function +// see function declaration in nnet-simple-component.h for details +void ConvolutionComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + ConvolutionComponent *to_update = + dynamic_cast(to_update_in); + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + num_filters = filter_params_.NumRows(), + num_frames = out_deriv.NumRows(), + filter_dim = filter_params_.NumCols(); + + KALDI_ASSERT(out_deriv.NumRows() == num_frames && + out_deriv.NumCols() == + (num_filters * num_x_steps * num_y_steps)); + + // Compute inderiv patches + CuMatrix in_deriv_patches(num_frames, + num_x_steps * num_y_steps * filter_dim, + kSetZero); + + std::vector* > patch_deriv_batch, out_deriv_batch, + filter_params_batch; + CuSubMatrix* filter_params_elem = new 
CuSubMatrix( + filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); + + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + + patch_deriv_batch.push_back(new CuSubMatrix( + in_deriv_patches.ColRange( + patch_number * filter_dim, filter_dim))); + out_deriv_batch.push_back(new CuSubMatrix(out_deriv.ColRange( + patch_number * num_filters, num_filters))); + filter_params_batch.push_back(filter_params_elem); + } + } + AddMatMatBatched(1.0, patch_deriv_batch, + out_deriv_batch, kNoTrans, + filter_params_batch, kNoTrans, 0.0); + + if (in_deriv) { + // combine the derivatives from the individual input deriv patches + // to compute input deriv matrix + InderivPatchesToInderiv(in_deriv_patches, in_deriv); + } + + if (to_update != NULL) { + to_update->Update(debug_info, in_value, out_deriv, out_deriv_batch); + } + + // release memory + delete filter_params_elem; + for (int32 p = 0; p < patch_deriv_batch.size(); p++) { + delete patch_deriv_batch[p]; + delete out_deriv_batch[p]; + } +} + + +// update parameters +// see function declaration in nnet-simple-component.h for details +void ConvolutionComponent::Update(const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv, + const std::vector *>& out_deriv_batch) { + // useful dims + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + num_filters = filter_params_.NumRows(), + num_frames = out_deriv.NumRows(), + filter_dim = filter_params_.NumCols(); + KALDI_ASSERT(out_deriv.NumRows() == num_frames && + out_deriv.NumCols() == + (num_filters * num_x_steps * num_y_steps)); + + + CuMatrix filters_grad; + CuVector bias_grad; + + CuMatrix input_patches(num_frames, + filter_dim * num_x_steps * num_y_steps, + kUndefined); + InputToInputPatches(in_value, &input_patches); + + filters_grad.Resize(num_filters, filter_dim, kSetZero); // reset + bias_grad.Resize(num_filters, kSetZero); // reset + + // create a single large matrix holding the smaller matrices + // from the vector container filters_grad_batch along the rows + CuMatrix filters_grad_blocks_batch( + num_x_steps * num_y_steps * filters_grad.NumRows(), + filters_grad.NumCols()); + + std::vector* > filters_grad_batch, input_patch_batch; + + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + filters_grad_batch.push_back(new CuSubMatrix( + filters_grad_blocks_batch.RowRange( + patch_number * filters_grad.NumRows(), filters_grad.NumRows()))); + + input_patch_batch.push_back(new CuSubMatrix( + input_patches.ColRange(patch_number * filter_dim, filter_dim))); + } + } + + AddMatMatBatched(1.0, filters_grad_batch, out_deriv_batch, kTrans, + input_patch_batch, kNoTrans, 1.0); + + // add the row blocks together to filters_grad + filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch); + + // create a matrix holding the col blocks sum of out_deriv + CuMatrix out_deriv_col_blocks_sum(out_deriv.NumRows(), + num_filters); + + // add the col blocks together to out_deriv_col_blocks_sum + out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv); + + bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0); + + // release memory + for (int32 p = 0; p < input_patch_batch.size(); p++) { + delete filters_grad_batch[p]; + delete 
input_patch_batch[p]; + } + + // + // update + // + filter_params_.AddMat(learning_rate_, filters_grad); + bias_params_.AddVec(learning_rate_, bias_grad); +} + +void ConvolutionComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_x_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_y_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_z_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_x_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_y_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_x_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_y_step_); + ExpectToken(is, binary, ""); + int32 input_vectorization; + ReadBasicType(is, binary, &input_vectorization); + input_vectorization_ = static_cast(input_vectorization); + ExpectToken(is, binary, ""); + filter_params_.Read(is, binary); + ExpectToken(is, binary, ""); + bias_params_.Read(is, binary); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &is_gradient_); + ExpectToken(is, binary, ""); + } else { + is_gradient_ = false; + KALDI_ASSERT(tok == ""); + } +} + +void ConvolutionComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // write opening tag and learning rate. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_x_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_z_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_x_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_x_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_y_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, static_cast(input_vectorization_)); + WriteToken(os, binary, ""); + filter_params_.Write(os, binary); + WriteToken(os, binary, ""); + bias_params_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, is_gradient_); + WriteToken(os, binary, ""); +} + +BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const { + const ConvolutionComponent *other = + dynamic_cast(&other_in); + return TraceMatMat(filter_params_, other->filter_params_, kTrans) + + VecVec(bias_params_, other->bias_params_); +} + +Component* ConvolutionComponent::Copy() const { + ConvolutionComponent *ans = new ConvolutionComponent(*this); + return ans; +} + +void ConvolutionComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_filter_params(filter_params_); + temp_filter_params.SetRandn(); + filter_params_.AddMat(stddev, temp_filter_params); + + CuVector temp_bias_params(bias_params_); + temp_bias_params.SetRandn(); + bias_params_.AddVec(stddev, temp_bias_params); +} + +void ConvolutionComponent::SetParams(const VectorBase &bias, + const MatrixBase &filter) { + bias_params_ = bias; + filter_params_ = filter; + KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows()); +} + +int32 ConvolutionComponent::NumParameters() const { + return (filter_params_.NumCols() + 1) * filter_params_.NumRows(); +} + +void ConvolutionComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == this->NumParameters()); + int32 num_filter_params = 
filter_params_.NumCols() * filter_params_.NumRows(); + params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_); + params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_); +} +void ConvolutionComponent::UnVectorize(const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == this->NumParameters()); + int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows(); + filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params)); + bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim())); +} + +// aquire input dim +int32 MaxpoolingComponent::InputDim() const { + return input_x_dim_ * input_y_dim_ * input_z_dim_; +} + +MaxpoolingComponent::MaxpoolingComponent( + const MaxpoolingComponent &component): + input_x_dim_(component.input_x_dim_), + input_y_dim_(component.input_y_dim_), + input_z_dim_(component.input_z_dim_), + pool_x_size_(component.pool_x_size_), + pool_y_size_(component.pool_y_size_), + pool_z_size_(component.pool_z_size_), + pool_x_step_(component.pool_x_step_), + pool_y_step_(component.pool_y_step_), + pool_z_step_(component.pool_z_step_) { } + +// aquire output dim +int32 MaxpoolingComponent::OutputDim() const { + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; + return num_pools_x * num_pools_y * num_pools_z; +} + +// check the component parameters +void MaxpoolingComponent::Check() const { + // sanity check of the max pooling parameters + KALDI_ASSERT(input_x_dim_ > 0); + KALDI_ASSERT(input_y_dim_ > 0); + KALDI_ASSERT(input_z_dim_ > 0); + KALDI_ASSERT(pool_x_size_ > 0); + KALDI_ASSERT(pool_y_size_ > 0); + KALDI_ASSERT(pool_z_size_ > 0); + KALDI_ASSERT(pool_x_step_ > 0); + KALDI_ASSERT(pool_y_step_ > 0); + KALDI_ASSERT(pool_z_step_ > 0); + KALDI_ASSERT(input_x_dim_ >= pool_x_size_); + KALDI_ASSERT(input_y_dim_ >= pool_y_size_); + KALDI_ASSERT(input_z_dim_ >= pool_z_size_); + KALDI_ASSERT(pool_x_size_ >= pool_x_step_); + KALDI_ASSERT(pool_y_size_ >= pool_y_step_); + KALDI_ASSERT(pool_z_size_ >= pool_z_step_); + KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0); + KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0); + KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0); +} + +// initialize the component using configuration file +void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { + bool ok = true; + + ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_); + ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_); + ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_); + ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_); + ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_); + ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_); + ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_); + ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_); + ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_); + + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + + Check(); +} + +// Method to convert from a matrix representing a minibatch of vectorized +// 3D tensors to patches for 3d max pooling, each patch corresponds to +// the nodes having the same local coordinatenodes from each pool +void MaxpoolingComponent::InputToInputPatches( + const CuMatrixBase& in, + 
CuMatrix *patches) const{ + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; + + std::vector column_map(patches->NumCols()); + int32 column_map_size = column_map.size(); + for (int32 x = 0, index =0; x < pool_x_size_; x++) { + for (int32 y = 0; y < pool_y_size_; y++) { + for (int32 z = 0; z < pool_z_size_; z++) { + // given the local node coordinate, group them from each pool + // to form a patch + for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { + for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { + for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { + KALDI_ASSERT(index < column_map_size); + column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + + (y_pool * pool_y_step_ + y) * input_z_dim_ + + (z_pool * pool_z_step_ + z); + + } + } + } + } + } + } + CuArray cu_cols(column_map); + patches->CopyCols(in, cu_cols); +} + +/* + This is the 3d max pooling propagate function. + It is assumed that each row of the input matrix + is a vectorized 3D-tensor of type zxy. + Similar to the propagate function of ConvolutionComponent, + the input matrix is first arranged into patches so that + pools (with / without overlapping) could be + processed in a parallelizable manner. + The output matrix is also a vectorized 3D-tensor of type zxy. +*/ + +void* MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + int32 num_frames = in.NumRows(); + int32 num_pools = OutputDim(); + int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; + CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); + InputToInputPatches(in, &patches); + + out->Set(-1e20); // reset a large negative value + for (int32 q = 0; q < pool_size; q++) + out->Max(patches.ColRange(q * num_pools, num_pools)); + return NULL; +} + +// Method to compute the input derivative matrix from the input derivatives +// for patches, where each patch corresponds to +// the nodes having the same local coordinatenodes from each pool +void MaxpoolingComponent::InderivPatchesToInderiv( + const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const { + + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; + + std::vector > reverse_column_map(in_deriv->NumCols()); + int32 rev_col_map_size = reverse_column_map.size(); + for (int32 x = 0, index = 0; x < pool_x_size_; x++) { + for (int32 y = 0; y < pool_y_size_; y++) { + for (int32 z = 0; z < pool_z_size_; z++) { + + for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { + for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { + for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { + int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + + (y_pool * pool_y_step_ + y) * input_z_dim_ + + (z_pool * pool_z_step_ + z); + + KALDI_ASSERT(vector_index < rev_col_map_size); + reverse_column_map[vector_index].push_back(index); + } + } + } + } + } + } + std::vector > rearranged_column_map; + RearrangeIndexes(reverse_column_map, &rearranged_column_map); + for (int32 p = 0; p < rearranged_column_map.size(); p++) { + CuArray cu_cols(rearranged_column_map[p]); + in_deriv->AddCols(in_deriv_patches, cu_cols); + } +} + +/* + 3d 
max pooling backpropagate function + This function backpropagate the error from + out_deriv to in_deriv. + In order to select the node in each pool to + backpropagate the error, it has to compare + the output pool value stored in the out_value + matrix with each of its input pool member node + stroed in the in_value matrix. +*/ +void MaxpoolingComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const { + if (!in_deriv) + return; + + int32 num_frames = in_value.NumRows(); + int32 num_pools = OutputDim(); + int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; + CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); + InputToInputPatches(in_value, &patches); + + for (int32 q = 0; q < pool_size; q++) { + // zero-out mask + CuMatrix mask; + out_value.EqualElementMask(patches.ColRange(q * num_pools, num_pools), &mask); + mask.MulElements(out_deriv); + patches.ColRange(q * num_pools, num_pools).CopyFromMat(mask); + } + + // combine the derivatives from the individual input deriv patches + // to compute input deriv matrix + InderivPatchesToInderiv(patches, in_deriv); +} + +void MaxpoolingComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_x_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_y_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_z_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_x_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_y_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_z_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_x_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_y_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_z_step_); + ExpectToken(is, binary, ""); + Check(); +} + +void MaxpoolingComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_x_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_z_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_step_); + WriteToken(os, binary, ""); +} + +// display information about component +std::string MaxpoolingComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", input-x-dim=" << input_x_dim_ + << ", input-y-dim=" << input_y_dim_ + << ", input-z-dim=" << input_z_dim_ + << ", pool-x-size=" << pool_x_size_ + << ", pool-y-size=" << pool_y_size_ + << ", pool-z-size=" << pool_z_size_ + << ", pool-x-step=" << pool_x_step_ + << ", pool-y-step=" << pool_y_step_ + << ", pool-z-step=" << pool_z_step_; + return stream.str(); +} + + +int32 LstmNonlinearityComponent::InputDim() const { + int32 cell_dim = value_sum_.NumCols(); + return cell_dim * 5 + 
(use_dropout_ ? 3 : 0); +} + +int32 LstmNonlinearityComponent::OutputDim() const { + int32 cell_dim = value_sum_.NumCols(); + return cell_dim * 2; +} + + +void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. + ExpectToken(is, binary, ""); + params_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + self_repair_config_.Read(is, binary); + ExpectToken(is, binary, ""); + self_repair_total_.Read(is, binary); + + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &use_dropout_); + ReadToken(is, binary, &tok); + } else { + use_dropout_ = false; + } + KALDI_ASSERT(tok == ""); + ReadBasicType(is, binary, &count_); + + // For the on-disk format, we normalze value_sum_, deriv_sum_ and + // self_repair_total_ by dividing by the count, but in memory they are scaled + // by the count. [for self_repair_total_, the scaling factor is count_ * + // cell_dim]. + value_sum_.Scale(count_); + deriv_sum_.Scale(count_); + int32 cell_dim = params_.NumCols(); + self_repair_total_.Scale(count_ * cell_dim); + + InitNaturalGradient(); + + ExpectToken(is, binary, ""); + +} + +void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // Read opening tag and learning rate. + + WriteToken(os, binary, ""); + params_.Write(os, binary); + WriteToken(os, binary, ""); + { + Matrix value_avg(value_sum_); + if (count_ != 0.0) + value_avg.Scale(1.0 / count_); + value_avg.Write(os, binary); + } + WriteToken(os, binary, ""); + { + Matrix deriv_avg(deriv_sum_); + if (count_ != 0.0) + deriv_avg.Scale(1.0 / count_); + deriv_avg.Write(os, binary); + } + WriteToken(os, binary, ""); + self_repair_config_.Write(os, binary); + WriteToken(os, binary, ""); + { + int32 cell_dim = params_.NumCols(); + Vector self_repair_prob(self_repair_total_); + if (count_ != 0.0) + self_repair_prob.Scale(1.0 / (count_ * cell_dim)); + self_repair_prob.Write(os, binary); + } + if (use_dropout_) { + // only write this if true; we have back-compat code in reading anyway. + // this makes the models without dropout easier to read with older code. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); +} + + + +std::string LstmNonlinearityComponent::Info() const { + std::ostringstream stream; + int32 cell_dim = params_.NumCols(); + stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim + << ", use-dropout=" << (use_dropout_ ? "true" : "false"); + PrintParameterStats(stream, "w_ic", params_.Row(0)); + PrintParameterStats(stream, "w_fc", params_.Row(1)); + PrintParameterStats(stream, "w_oc", params_.Row(2)); + + // Note: some of the following code mirrors the code in + // UpdatableComponent::Info(), in nnet-component-itf.cc. 
+ if (count_ > 0) { + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + } + static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh", + "o_t_sigmoid", "m_t_tanh" }; + for (int32 i = 0; i < 5; i++) { + stream << ", " << nonlin_names[i] << "={"; + stream << " self-repair-lower-threshold=" << self_repair_config_(i) + << ", self-repair-scale=" << self_repair_config_(i + 5); + + if (count_ != 0) { + BaseFloat self_repaired_proportion = + self_repair_total_(i) / (count_ * cell_dim); + stream << ", self-repaired-proportion=" << self_repaired_proportion; + Vector value_sum(value_sum_.Row(i)), + deriv_sum(deriv_sum_.Row(i)); + Vector value_avg(value_sum), deriv_avg(deriv_sum); + value_avg.Scale(1.0 / count_); + deriv_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg) + << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + stream << " }"; + } + return stream.str(); +} + + +Component* LstmNonlinearityComponent::Copy() const { + return new LstmNonlinearityComponent(*this); +} + +void LstmNonlinearityComponent::ZeroStats() { + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_.SetZero(); + count_ = 0.0; +} + +void LstmNonlinearityComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + params_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_.SetZero(); + count_ = 0.0; + } else { + params_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_.Scale(scale); + count_ *= scale; + } +} + +void LstmNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const LstmNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + params_.AddMat(alpha, other->params_); + value_sum_.AddMat(alpha, other->value_sum_); + deriv_sum_.AddMat(alpha, other->deriv_sum_); + self_repair_total_.AddVec(alpha, other->self_repair_total_); + count_ += alpha * other->count_; +} + +void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_params(params_.NumRows(), params_.NumCols()); + temp_params.SetRandn(); + params_.AddMat(stddev, temp_params); +} + +BaseFloat LstmNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const LstmNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return TraceMatMat(params_, other->params_, kTrans); +} + +int32 LstmNonlinearityComponent::NumParameters() const { + return params_.NumRows() * params_.NumCols(); +} + +void LstmNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyRowsFromMat(params_); +} + + +void LstmNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + params_.CopyRowsFromVec(params); +} + + +void* LstmNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in, + CuMatrixBase *out) const { + cu::ComputeLstmNonlinearity(in, params_, out); + return NULL; +} + + +void LstmNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + + if (to_update_in == NULL) { + cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, + deriv_sum_, self_repair_config_, + count_, in_deriv, + 
(CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL); + } else { + LstmNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(to_update != NULL); + + int32 cell_dim = params_.NumCols(); + CuMatrix params_deriv(3, cell_dim, kUndefined); + CuMatrix self_repair_total(5, cell_dim, kUndefined); + + cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, + deriv_sum_, self_repair_config_, + count_, in_deriv, ¶ms_deriv, + &(to_update->value_sum_), + &(to_update->deriv_sum_), + &self_repair_total); + + CuVector self_repair_total_sum(5); + self_repair_total_sum.AddColSumMat(1.0, self_repair_total, 0.0); + to_update->self_repair_total_.AddVec(1.0, self_repair_total_sum); + to_update->count_ += static_cast(in_value.NumRows()); + + BaseFloat scale = 1.0; + if (!to_update->is_gradient_) { + to_update->preconditioner_.PreconditionDirections( + ¶ms_deriv, &scale); + } + to_update->params_.AddMat(to_update->learning_rate_ * scale, + params_deriv); + } +} + +LstmNonlinearityComponent::LstmNonlinearityComponent( + const LstmNonlinearityComponent &other): + UpdatableComponent(other), + params_(other.params_), + use_dropout_(other.use_dropout_), + value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_config_(other.self_repair_config_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + preconditioner_(other.preconditioner_) { } + +void LstmNonlinearityComponent::Init( + int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale) { + KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 && + tanh_self_repair_threshold >= 0.0 && + tanh_self_repair_threshold <= 1.0 && + sigmoid_self_repair_threshold >= 0.0 && + sigmoid_self_repair_threshold <= 0.25 && + self_repair_scale >= 0.0 && self_repair_scale <= 0.1); + use_dropout_ = use_dropout; + params_.Resize(3, cell_dim); + params_.SetRandn(); + params_.Scale(param_stddev); + value_sum_.Resize(5, cell_dim); + deriv_sum_.Resize(5, cell_dim); + self_repair_config_.Resize(10); + self_repair_config_.Range(0, 5).Set(sigmoid_self_repair_threshold); + self_repair_config_(2) = tanh_self_repair_threshold; + self_repair_config_(4) = tanh_self_repair_threshold; + self_repair_config_.Range(5, 5).Set(self_repair_scale); + self_repair_total_.Resize(5); + count_ = 0.0; + InitNaturalGradient(); + +} + +void LstmNonlinearityComponent::InitNaturalGradient() { + // As regards the configuration for the natural-gradient preconditioner, we + // don't make it configurable from the command line-- it's unlikely that any + // differences from changing this would be substantial enough to effectively + // tune the configuration. Because the preconditioning code doesn't 'see' the + // derivatives from individual frames, but only averages over the minibatch, + // there is a fairly small amount of data available to estimate the Fisher + // information matrix, so we set the rank, update period and + // num-samples-history to smaller values than normal. 
+ preconditioner_.SetRank(20); + preconditioner_.SetUpdatePeriod(2); + preconditioner_.SetNumSamplesHistory(1000.0); +} + +/// virtual +void LstmNonlinearityComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_.Freeze(freeze); +} + +void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + InitLearningRatesFromConfig(cfl); + bool ok = true; + bool use_dropout = false; + int32 cell_dim; + // these self-repair thresholds are the normal defaults for tanh and sigmoid + // respectively. If, later on, we decide that we want to support different + // self-repair config values for the individual sigmoid and tanh + // nonlinearities, we can modify this code then. + BaseFloat tanh_self_repair_threshold = 0.2, + sigmoid_self_repair_threshold = 0.05, + self_repair_scale = 1.0e-05; + // param_stddev is the stddev of the parameters. it may be better to + // use a smaller value but this was the default in the python scripts + // for a while. + BaseFloat param_stddev = 1.0; + ok = ok && cfl->GetValue("cell-dim", &cell_dim); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("tanh-self-repair-threshold", + &tanh_self_repair_threshold); + cfl->GetValue("sigmoid-self-repair-threshold", + &sigmoid_self_repair_threshold); + cfl->GetValue("self-repair-scale", &self_repair_scale); + cfl->GetValue("use-dropout", &use_dropout); + + // We may later on want to make it possible to initialize the different + // parameters w_ic, w_fc and w_oc with different biases. We'll implement + // that when and if it's needed. + + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (ok) { + Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, + sigmoid_self_repair_threshold, self_repair_scale); + } else { + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + } +} + +void LstmNonlinearityComponent::ConsolidateMemory() { + OnlineNaturalGradient preconditioner_temp(preconditioner_); + preconditioner_.Swap(&preconditioner_); +} + + +int32 GruNonlinearityComponent::InputDim() const { + if (recurrent_dim_ == cell_dim_) { + // non-projected GRU. + return 4 * cell_dim_; + } else { + return 3 * cell_dim_ + 2 * recurrent_dim_; + } +} + +int32 GruNonlinearityComponent::OutputDim() const { + return 2 * cell_dim_; +} + + +std::string GruNonlinearityComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", cell-dim=" << cell_dim_ + << ", recurrent-dim=" << recurrent_dim_; + PrintParameterStats(stream, "w_h", w_h_); + stream << ", self-repair-threshold=" << self_repair_threshold_ + << ", self-repair-scale=" << self_repair_scale_; + if (count_ > 0) { // c.f. NonlinearComponent::Info(). + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + stream << ", self-repaired-proportion=" + << (self_repair_total_ / (count_ * cell_dim_)); + Vector value_avg_dbl(value_sum_); + Vector value_avg(value_avg_dbl); + value_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg); + Vector deriv_avg_dbl(deriv_sum_); + Vector deriv_avg(deriv_avg_dbl); + deriv_avg.Scale(1.0 / count_); + stream << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + // natural-gradient parameters. 
+ stream << ", alpha=" << preconditioner_in_.GetAlpha() + << ", rank-in=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", update-period=" + << preconditioner_in_.GetUpdatePeriod(); + return stream.str(); +} + +void GruNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + cell_dim_ = -1; + recurrent_dim_ = -1; + self_repair_threshold_ = 0.2; + self_repair_scale_ = 1.0e-05; + + InitLearningRatesFromConfig(cfl); + if (!cfl->GetValue("cell-dim", &cell_dim_) || cell_dim_ <= 0) + KALDI_ERR << "cell-dim > 0 is required for GruNonlinearityComponent."; + + BaseFloat param_stddev = 1.0 / std::sqrt(cell_dim_), + alpha = 4.0; + int32 rank_in = 20, rank_out = 80, + update_period = 4; + + cfl->GetValue("recurrent-dim", &recurrent_dim_); + cfl->GetValue("self-repair-threshold", &self_repair_threshold_); + cfl->GetValue("self-repair-scale", &self_repair_scale_); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("update-period", &update_period); + + if (recurrent_dim_ < 0) + recurrent_dim_ = cell_dim_; + if (recurrent_dim_ == 0 || recurrent_dim_ > cell_dim_) + KALDI_ERR << "Invalid values for cell-dim and recurrent-dim"; + + w_h_.Resize(cell_dim_, recurrent_dim_); + w_h_.SetRandn(); + w_h_.Scale(param_stddev); + + preconditioner_in_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_in_.SetUpdatePeriod(update_period); + preconditioner_out_.SetAlpha(alpha); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + + count_ = 0.0; + self_repair_total_ = 0.0; + value_sum_.Resize(cell_dim_); + deriv_sum_.Resize(cell_dim_); + + Check(); +} + +void* GruNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumRows() == out->NumRows() && + in.NumCols() == InputDim() && + out->NumCols() == OutputDim()); + // If recurrent_dim_ != cell_dim_, this is projected GRU and we + // are computing: + // (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t). + // Otherwise (no projection), it's + // (z_t, r_t, hpart_t, y_{t-1},) -> (h_t, y_t). + // but to understand this code, it's better to rename y to c: + // (z_t, r_t, hpart_t, c_{t-1}) -> (h_t, c_t). + int32 num_rows = in.NumRows(), + c = cell_dim_, + r = recurrent_dim_; + CuSubMatrix z_t(in, 0, num_rows, 0, c), + r_t(in, 0, num_rows, c, r), + hpart_t(in, 0, num_rows, c + r, c), + c_t1(in, 0, num_rows, c + r + c, c); + // note: the variable named 'c_t1' actually represents + // y_{t-1} for non-projected GRUs. + + // By setting s_t1 to the last recurrent_dim_ rows of 'in', we get something + // that represents s_{t-1} for recurrent setups and y_{t-1} (which we're + // renaming to c_{t-1}) for non-projected GRUs. The key thing is that + // in the non-projected case, the variables c_t1 and s_t1 point to the + // same memory. + CuSubMatrix s_t1(in, 0, num_rows, in.NumCols() - r, r); + + // note: for non-projected GRUs, c_t below is actually y_t. + CuSubMatrix h_t(*out, 0, num_rows, 0, c), + c_t(*out, 0, num_rows, c, c); + + // sdotr is the only temporary storage we need in the forward pass. + CuMatrix sdotr(num_rows, r); + sdotr.AddMatMatElements(1.0, r_t, s_t1, 0.0); + // now sdotr = r_t \dot s_{t-1}. + h_t.CopyFromMat(hpart_t); + // now h_t = hpart_t (note: hpart_t actually means U^h x_t). 
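+ // Note on dimensions in the AddMatMat call below: sdotr is num_rows x
+ // recurrent_dim and w_h_ is cell_dim x recurrent_dim (see InitFromConfig),
+ // so sdotr * w_h_^T is num_rows x cell_dim, matching h_t.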
+ h_t.AddMatMat(1.0, sdotr, kNoTrans, w_h_, kTrans, 1.0); + // now h_t = hpart_t + W^h (s_{t-1} \dot r_t). + h_t.Tanh(h_t); + // now, h_t = tanh(hpart_t + W^h (s_{t-1} \dot r_t)). + + c_t.CopyFromMat(h_t); + // now c_t = h_t + c_t.AddMatMatElements(-1.0, z_t, h_t, 1.0); + // now c_t = (1 - z_t) \dot h_t. + c_t.AddMatMatElements(1.0, z_t, c_t1, 1.0); + // now c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + return NULL; +} + +void GruNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(SameDim(out_value, out_deriv) && + in_value.NumRows() == out_value.NumRows() && + in_value.NumCols() == InputDim() && + out_value.NumCols() == OutputDim() && + (in_deriv == NULL || SameDim(in_value, *in_deriv)) && + memo == NULL); + GruNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(in_deriv != NULL || to_update != NULL); + int32 num_rows = in_value.NumRows(), + c = cell_dim_, + r = recurrent_dim_; + + // To understand what's going on here, compare this code with the + // corresponding 'forward' code in Propagate(). + + + CuSubMatrix z_t(in_value, 0, num_rows, 0, c), + r_t(in_value, 0, num_rows, c, r), + hpart_t(in_value, 0, num_rows, c + r, c), + c_t1(in_value, 0, num_rows, c + r + c, c), + s_t1(in_value, 0, num_rows, in_value.NumCols() - r, r); + + + // The purpose of this 'in_deriv_ptr' is so that we can create submatrices + // like z_t_deriv without the code crashing. If in_deriv is NULL these point + // to 'in_value', and we'll be careful never to actually write to these + // sub-matrices, which aside from being conceptually wrong would violate the + // const semantics of this function. + const CuMatrixBase *in_deriv_ptr = + (in_deriv == NULL ? &in_value : in_deriv); + CuSubMatrix z_t_deriv(*in_deriv_ptr, 0, num_rows, 0, c), + r_t_deriv(*in_deriv_ptr, 0, num_rows, c, r), + hpart_t_deriv(*in_deriv_ptr, 0, num_rows, c + r, c), + c_t1_deriv(*in_deriv_ptr, 0, num_rows, c + r + c, c), + s_t1_deriv(*in_deriv_ptr, 0, num_rows, in_value.NumCols() - r, r); + + // Note: the output h_t is never actually used in the GRU computation (we only + // output it because we want the value to be cached to save computation in the + // backprop), so we expect that the 'h_t_deriv', if we extracted it in the + // obvious way, would be all zeros. + // We create a different, local h_t_deriv + // variable that backpropagates the derivative from c_t_deriv. + CuSubMatrix h_t(out_value, 0, num_rows, 0, c), + c_t(out_value, 0, num_rows, c, c), + c_t_deriv(out_deriv, 0, num_rows, c, c); + CuMatrix h_t_deriv(num_rows, c, kUndefined); + + { // we initialize h_t_deriv with the derivative from 'out_deriv'. + // In real life in a GRU, this would always be zero; but in testing + // code it may be nonzero and we include this term so that + // the tests don't fail. Note: if you were to remove these + // lines, you'd have to change 'h_t_deriv.AddMat(1.0, c_t_deriv);' below + // to a CopyFromMat() call. + CuSubMatrix h_t_deriv_in(out_deriv, 0, num_rows, 0, c); + h_t_deriv.CopyFromMat(h_t_deriv_in); + } + + + // sdotr is the same variable as used in the forward pass, it will contain + // r_t \dot s_{t-1}. 
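+ // As a summary of what the rest of this function computes (it follows from
+ // the forward-pass expressions above, using d(tanh x)/dx = 1 - tanh^2 x):
+ //   h_t_deriv      = c_t_deriv \dot (1 - z_t)   [plus any deriv w.r.t. h_t itself]
+ //   z_t_deriv     += c_t_deriv \dot (c_{t-1} - h_t)
+ //   c_t1_deriv    += c_t_deriv \dot z_t
+ //   a_t_deriv      = h_t_deriv \dot (1 - h_t^2),
+ //                    where a_t = hpart_t + W^h (s_{t-1} \dot r_t)
+ //   hpart_t_deriv += a_t_deriv
+ //   r_t_deriv     += (a_t_deriv W^h) \dot s_{t-1}
+ //   s_t1_deriv    += (a_t_deriv W^h) \dot r_t
+ //   W^h gradient   = a_t_deriv^T (s_{t-1} \dot r_t)
+ // (plus the self-repair term added in TanhStatsAndSelfRepair()).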
+ CuMatrix sdotr(num_rows, r); + sdotr.AddMatMatElements(1.0, r_t, s_t1, 0.0); + + + { // This block does the + // backprop corresponding to the + // forward-pass expression: c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + + // First do: h_t_deriv = c_t_deriv \dot (1 - z_t). + h_t_deriv.AddMat(1.0, c_t_deriv); + h_t_deriv.AddMatMatElements(-1.0, c_t_deriv, z_t, 1.0); + + if (in_deriv) { + // these should be self-explanatory if you study + // the expression "c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}". + z_t_deriv.AddMatMatElements(-1.0, c_t_deriv, h_t, 1.0); + z_t_deriv.AddMatMatElements(1.0, c_t_deriv, c_t1, 1.0); + c_t1_deriv.AddMatMatElements(1.0, c_t_deriv, z_t, 1.0); + } + } + + h_t_deriv.DiffTanh(h_t, h_t_deriv); + if (to_update) + to_update->TanhStatsAndSelfRepair(h_t, &h_t_deriv); + + + if (to_update) + to_update->UpdateParameters(sdotr, h_t_deriv); + + // At this point, 'h_t_deriv' contains the derivative w.r.t. + // the argument of the tanh function, i.e. w.r.t. the expression: + // hpart_t + W^h (s_{t-1} \dot r_t). + // The next block propagates this to the derivatives for + // hpart_t, s_{t-1} and r_t. + if (in_deriv) { + hpart_t_deriv.AddMat(1.0, h_t_deriv); + + // We re-use the memory that we used for s_{t-1} \dot r_t, + // for its derivative. + CuMatrix &sdotr_deriv(sdotr); + sdotr_deriv.AddMatMat(1.0, h_t_deriv, kNoTrans, w_h_, kNoTrans, 0.0); + + // we add to all the input-derivatives instead of setting them, + // because we chose to export the flag kBackpropAdds. + r_t_deriv.AddMatMatElements(1.0, sdotr_deriv, s_t1, 1.0); + s_t1_deriv.AddMatMatElements(1.0, sdotr_deriv, r_t, 1.0); + } +} + + +void GruNonlinearityComponent::TanhStatsAndSelfRepair( + const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv) { + KALDI_ASSERT(SameDim(h_t, *h_t_deriv)); + + // we use this probability (hardcoded for now) to limit the stats accumulation + // and self-repair code to running on about half of the minibatches. + BaseFloat repair_and_stats_probability = 0.5; + if (RandUniform() > repair_and_stats_probability) + return; + + // OK, accumulate stats. + // For the next few lines, compare with TanhComponent::StoreStats(), which is where + // we got this code. + // tanh_deriv is the function derivative of the tanh function, + // tanh'(x) = tanh(x) * (1.0 - tanh(x)). h_t corresponds to tanh(x). + CuMatrix tanh_deriv(h_t); + tanh_deriv.ApplyPow(2.0); + tanh_deriv.Scale(-1.0); + tanh_deriv.Add(1.0); + + count_ += h_t.NumRows(); + CuVector temp(cell_dim_); + temp.AddRowSumMat(1.0, h_t, 0.0); + value_sum_.AddVec(1.0, temp); + temp.AddRowSumMat(1.0, tanh_deriv, 0.0); + deriv_sum_.AddVec(1.0, temp); + + if (count_ <= 0.0) { + // this would be rather pathological if it happened. + return; + } + + // The rest of this function contains code modified from + // TanhComponent::RepairGradients(). + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, cell_dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(self_repair_threshold_ * count_); + thresholds.ApplyHeaviside(); + self_repair_total_ += thresholds_vec.Sum(); + + // there is a comment explaining what we are doing with + // 'thresholds_vec', at this point in TanhComponent::RepairGradients(). + // We won't repeat it here. 
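+ // In short: thresholds_vec(i) is now 1.0 for any dimension i whose average
+ // tanh-derivative, deriv_sum_(i) / count_, has dropped below
+ // self_repair_threshold_, and 0.0 otherwise.  The call below adds
+ // -self_repair_scale_ / repair_and_stats_probability times h_t to the
+ // gradient for just those dimensions (the division compensates for only
+ // doing this on about half of the minibatches), which nudges saturated
+ // tanh units back toward the linear range.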
+ + h_t_deriv->AddMatDiagVec(-self_repair_scale_ / repair_and_stats_probability, + h_t, kNoTrans, thresholds_vec); +} + +void GruNonlinearityComponent::UpdateParameters( + const CuMatrixBase &sdotr, + const CuMatrixBase &h_t_deriv) { + if (is_gradient_) { + // 'simple' update, no natural gradient. Compare + // with AffineComponent::UpdateSimple(). + w_h_.AddMatMat(learning_rate_, h_t_deriv, kTrans, + sdotr, kNoTrans, 1.0); + } else { + // the natural-gradient update. + CuMatrix in_value_temp(sdotr), + out_deriv_temp(h_t_deriv); + + // These "scale" values get will get multiplied into the learning rate. + BaseFloat in_scale, out_scale; + + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); + + BaseFloat local_lrate = learning_rate_ * in_scale * out_scale; + w_h_.AddMatMat(local_lrate, out_deriv_temp, kTrans, + in_value_temp, kNoTrans, 1.0); + } +} + + + +void GruNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &cell_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &recurrent_dim_); + ExpectToken(is, binary, ""); + w_h_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_total_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + value_sum_.Scale(count_); // we read in the averages, not the sums. + deriv_sum_.Scale(count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_scale_); + BaseFloat alpha; + int32 rank_in, rank_out, update_period; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &alpha); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &rank_in); + ReadBasicType(is, binary, &rank_out); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &update_period); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetUpdatePeriod(update_period); + preconditioner_out_.SetUpdatePeriod(update_period); + ExpectToken(is, binary, ""); +} + +void GruNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, cell_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, recurrent_dim_); + WriteToken(os, binary, ""); + w_h_.Write(os, binary); + { + // Write the value and derivative stats in a count-normalized way, for + // greater readability in text form. 
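+ // (The stats are held internally as sums; the division by count_ here is
+ // undone in Read(), which scales the averages back up by count_.)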
+ WriteToken(os, binary, ""); + Vector temp(value_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + temp.CopyFromVec(deriv_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_total_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_scale_); + + BaseFloat alpha = preconditioner_in_.GetAlpha(); + int32 rank_in = preconditioner_in_.GetRank(), + rank_out = preconditioner_out_.GetRank(), + update_period = preconditioner_in_.GetUpdatePeriod(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, alpha); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rank_in); + WriteBasicType(os, binary, rank_out); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, update_period); + WriteToken(os, binary, ""); +} + +void GruNonlinearityComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + w_h_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; + } else { + w_h_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_ *= scale; + count_ *= scale; + } +} + +void GruNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const GruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + w_h_.AddMat(alpha, other->w_h_); + value_sum_.AddVec(alpha, other->value_sum_); + deriv_sum_.AddVec(alpha, other->deriv_sum_); + self_repair_total_ += alpha * other->self_repair_total_; + count_ += alpha * other->count_; +} + +void GruNonlinearityComponent::ZeroStats() { + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; +} + +void GruNonlinearityComponent::Check() const { + KALDI_ASSERT(cell_dim_ > 0 && recurrent_dim_ > 0 && + recurrent_dim_ <= cell_dim_ && + self_repair_threshold_ >= 0.0 && + self_repair_scale_ >= 0.0 ); + KALDI_ASSERT(w_h_.NumRows() == cell_dim_ && + w_h_.NumCols() == recurrent_dim_); + KALDI_ASSERT(value_sum_.Dim() == cell_dim_ && + deriv_sum_.Dim() == cell_dim_); +} + +void GruNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_params(w_h_.NumRows(), w_h_.NumCols()); + temp_params.SetRandn(); + w_h_.AddMat(stddev, temp_params); +} + +BaseFloat GruNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const GruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return TraceMatMat(w_h_, other->w_h_, kTrans); +} + +int32 GruNonlinearityComponent::NumParameters() const { + return w_h_.NumRows() * w_h_.NumCols(); +} + +void GruNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyRowsFromMat(w_h_); +} + + +void GruNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + w_h_.CopyRowsFromVec(params); +} + +void GruNonlinearityComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_in_.Freeze(freeze); + preconditioner_out_.Freeze(freeze); +} + +GruNonlinearityComponent::GruNonlinearityComponent( + const GruNonlinearityComponent &other): + UpdatableComponent(other), + cell_dim_(other.cell_dim_), + recurrent_dim_(other.recurrent_dim_), + w_h_(other.w_h_), + 
value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + self_repair_threshold_(other.self_repair_threshold_), + self_repair_scale_(other.self_repair_scale_), + preconditioner_in_(other.preconditioner_in_), + preconditioner_out_(other.preconditioner_out_) { + Check(); +} + + +int32 OutputGruNonlinearityComponent::InputDim() const { + return 3 * cell_dim_; +} + +int32 OutputGruNonlinearityComponent::OutputDim() const { + return 2 * cell_dim_; +} + + +std::string OutputGruNonlinearityComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", cell-dim=" << cell_dim_; + PrintParameterStats(stream, "w_h", w_h_); + stream << ", self-repair-threshold=" << self_repair_threshold_ + << ", self-repair-scale=" << self_repair_scale_; + if (count_ > 0) { // c.f. NonlinearComponent::Info(). + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + stream << ", self-repaired-proportion=" + << (self_repair_total_ / (count_ * cell_dim_)); + Vector value_avg_dbl(value_sum_); + Vector value_avg(value_avg_dbl); + value_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg); + Vector deriv_avg_dbl(deriv_sum_); + Vector deriv_avg(deriv_avg_dbl); + deriv_avg.Scale(1.0 / count_); + stream << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + // natural-gradient parameters. + stream << ", alpha=" << preconditioner_.GetAlpha() + << ", rank=" << preconditioner_.GetRank() + << ", update-period=" + << preconditioner_.GetUpdatePeriod(); + return stream.str(); +} + +void OutputGruNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + cell_dim_ = -1; + self_repair_threshold_ = 0.2; + self_repair_scale_ = 1.0e-05; + + InitLearningRatesFromConfig(cfl); + if (!cfl->GetValue("cell-dim", &cell_dim_) || cell_dim_ <= 0) + KALDI_ERR << "cell-dim > 0 is required for GruNonlinearityComponent."; + + BaseFloat param_mean = 0.0, param_stddev = 1.0, + alpha = 4.0; + int32 rank=8, + update_period = 10; + + cfl->GetValue("self-repair-threshold", &self_repair_threshold_); + cfl->GetValue("self-repair-scale", &self_repair_scale_); + cfl->GetValue("param-mean", ¶m_mean); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank", &rank); + cfl->GetValue("update-period", &update_period); + + + w_h_.Resize(cell_dim_); + w_h_.SetRandn(); + w_h_.Scale(param_stddev); + w_h_.Add(param_mean); + + preconditioner_.SetAlpha(alpha); + preconditioner_.SetRank(rank); + preconditioner_.SetUpdatePeriod(update_period); + + count_ = 0.0; + self_repair_total_ = 0.0; + value_sum_.Resize(cell_dim_); + deriv_sum_.Resize(cell_dim_); + + Check(); +} + +void* OutputGruNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumRows() == out->NumRows() && + in.NumCols() == InputDim() && + out->NumCols() == OutputDim()); + // This component implements the function + // (z_t, hpart_t, c_{t-1}) -> (h_t, c_t) + // of dimensions + // (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim), + // where: + // h_t = \tanh( hpart_t + W^h \dot c_{t-1} ) + // c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. 
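+ // Unlike GruNonlinearityComponent, here W^h is the per-element vector w_h_
+ // of dimension cell_dim, so "W^h \dot c_{t-1}" is an elementwise product
+ // (implemented with MulColsVec below).  The input columns are laid out as
+ // (z_t, hpart_t, c_{t-1}), each of width cell_dim, and the output columns
+ // as (h_t, c_t).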
+ int32 num_rows = in.NumRows(), + c = cell_dim_; + CuSubMatrix z_t(in, 0, num_rows, 0, c), + hpart_t(in, 0, num_rows, c, c), + c_t1(in, 0, num_rows, c + c, c); + + CuSubMatrix h_t(*out, 0, num_rows, 0, c), + c_t(*out, 0, num_rows, c, c); + + h_t.CopyFromMat(c_t1); + // now h_t = c_{t-1} + h_t.MulColsVec(w_h_); + // now h_t = W^h \dot c_{t-1} + h_t.AddMat(1.0, hpart_t, kNoTrans); + // now h_t = hpart_t + W^h \dot c_{t-1}.(note: hpart_t actually means U^h x_t). + h_t.Tanh(h_t); + // now, h_t = tanh(hpart_t + W^h \dot c_{t-1}). + + c_t.CopyFromMat(h_t); + // now c_t = h_t + c_t.AddMatMatElements(-1.0, z_t, h_t, 1.0); + // now c_t = (1 - z_t) \dot h_t. + c_t.AddMatMatElements(1.0, z_t, c_t1, 1.0); + // now c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + return NULL; +} + +void OutputGruNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(SameDim(out_value, out_deriv) && + in_value.NumRows() == out_value.NumRows() && + in_value.NumCols() == InputDim() && + out_value.NumCols() == OutputDim() && + (in_deriv == NULL || SameDim(in_value, *in_deriv)) && + memo == NULL); + OutputGruNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(in_deriv != NULL || to_update != NULL); + int32 num_rows = in_value.NumRows(), + c = cell_dim_; + + // To understand what's going on here, compare this code with the + // corresponding 'forward' code in Propagate(). + + + CuSubMatrix z_t(in_value, 0, num_rows, 0, c), + hpart_t(in_value, 0, num_rows, c, c), + c_t1(in_value, 0, num_rows, c + c, c); + + // The purpose of this 'in_deriv_ptr' is so that we can create submatrices + // like z_t_deriv without the code crashing. If in_deriv is NULL these point + // to 'in_value', and we'll be careful never to actually write to these + // sub-matrices, which aside from being conceptually wrong would violate the + // const semantics of this function. + const CuMatrixBase *in_deriv_ptr = + (in_deriv == NULL ? &in_value : in_deriv); + CuSubMatrix z_t_deriv(*in_deriv_ptr, 0, num_rows, 0, c), + hpart_t_deriv(*in_deriv_ptr, 0, num_rows, c, c), + c_t1_deriv(*in_deriv_ptr, 0, num_rows, c + c, c); + + // Note: the output h_t is never actually used in the GRU computation (we only + // output it because we want the value to be cached to save computation in the + // backprop), so we expect that the 'h_t_deriv', if we extracted it in the + // obvious way, would be all zeros. + // We create a different, local h_t_deriv + // variable that backpropagates the derivative from c_t_deriv. + CuSubMatrix h_t(out_value, 0, num_rows, 0, c), + c_t(out_value, 0, num_rows, c, c), + c_t_deriv(out_deriv, 0, num_rows, c, c); + CuMatrix h_t_deriv(num_rows, c, kUndefined); + + { // we initialize h_t_deriv with the derivative from 'out_deriv'. + // In real life in a GRU, this would always be zero; but in testing + // code it may be nonzero and we include this term so that + // the tests don't fail. Note: if you were to remove these + // lines, you'd have to change 'h_t_deriv.AddMat(1.0, c_t_deriv);' below + // to a CopyFromMat() call. + CuSubMatrix h_t_deriv_in(out_deriv, 0, num_rows, 0, c); + h_t_deriv.CopyFromMat(h_t_deriv_in); + } + + + { // This block does the + // backprop corresponding to the + // forward-pass expression: c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. 
+ + // First do: h_t_deriv = c_t_deriv \dot (1 - z_t). + h_t_deriv.AddMat(1.0, c_t_deriv); + h_t_deriv.AddMatMatElements(-1.0, c_t_deriv, z_t, 1.0); + + if (in_deriv) { + // these should be self-explanatory if you study + // the expression "c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}". + z_t_deriv.AddMatMatElements(-1.0, c_t_deriv, h_t, 1.0); + z_t_deriv.AddMatMatElements(1.0, c_t_deriv, c_t1, 1.0); + c_t1_deriv.AddMatMatElements(1.0, c_t_deriv, z_t, 1.0); + } + } + + h_t_deriv.DiffTanh(h_t, h_t_deriv); + if (to_update) + to_update->TanhStatsAndSelfRepair(h_t, &h_t_deriv); + + if (to_update) + to_update->UpdateParameters(c_t1, h_t_deriv); + // At this point, 'h_t_deriv' contains the derivative w.r.t. + // the argument of the tanh function, i.e. w.r.t. the expression: + // hpart_t + W^h \dot c_{t-1}. + // The next block propagates this to the derivative for h_part_t and c_t1 + // The derivative of z_t has already been finished. + if (in_deriv) { + hpart_t_deriv.AddMat(1.0, h_t_deriv); + + // Currently, c_t1_deriv contains the derivative from + // c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1} + // Now compute the h_t = \tanh(hpart_t + W^h \dot c_{t-1}) part + h_t_deriv.MulColsVec(w_h_); + // Combine the two parts + c_t1_deriv.AddMat(1.0, h_t_deriv); + } +} + + +void OutputGruNonlinearityComponent::TanhStatsAndSelfRepair( + const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv) { + KALDI_ASSERT(SameDim(h_t, *h_t_deriv)); + + // we use this probability (hardcoded for now) to limit the stats accumulation + // and self-repair code to running on about half of the minibatches. + BaseFloat repair_and_stats_probability = 0.5; + if (RandUniform() > repair_and_stats_probability) + return; + + // OK, accumulate stats. + // For the next few lines, compare with TanhComponent::StoreStats(), which is where + // we got this code. + // tanh_deriv is the function derivative of the tanh function, + // tanh'(x) = tanh(x) * (1.0 - tanh(x)). h_t corresponds to tanh(x). + CuMatrix tanh_deriv(h_t); + tanh_deriv.ApplyPow(2.0); + tanh_deriv.Scale(-1.0); + tanh_deriv.Add(1.0); + + count_ += h_t.NumRows(); + CuVector temp(cell_dim_); + temp.AddRowSumMat(1.0, h_t, 0.0); + value_sum_.AddVec(1.0, temp); + temp.AddRowSumMat(1.0, tanh_deriv, 0.0); + deriv_sum_.AddVec(1.0, temp); + + if (count_ <= 0.0) { + // this would be rather pathological if it happened. + return; + } + + // The rest of this function contains code modified from + // TanhComponent::RepairGradients(). + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, cell_dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(self_repair_threshold_ * count_); + thresholds.ApplyHeaviside(); + self_repair_total_ += thresholds_vec.Sum(); + + // there is a comment explaining what we are doing with + // 'thresholds_vec', at this point in TanhComponent::RepairGradients(). + // We won't repeat it here. + + h_t_deriv->AddMatDiagVec(-self_repair_scale_ / repair_and_stats_probability, + h_t, kNoTrans, thresholds_vec); +} + +void OutputGruNonlinearityComponent::UpdateParameters( + const CuMatrixBase &c_t1_value, + const CuMatrixBase &h_t_deriv) { + if (is_gradient_) { + // 'simple' update, no natural gradient. Compare + // with PerElementScaleComponent::UpdateSimple(). + w_h_.AddDiagMatMat(learning_rate_, h_t_deriv, kTrans, + c_t1_value, kNoTrans, 1.0); + } else { + // the natural-gradient update. 
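+ // Since the pre-tanh activation is hpart_t + w_h_ \dot c_{t-1}, the gradient
+ // w.r.t. w_h_ is the sum over frames of h_t_deriv \dot c_{t-1} (h_t_deriv
+ // here is already the derivative w.r.t. that pre-tanh activation).  Below,
+ // 'derivs_per_frame' holds those per-frame products; the preconditioner
+ // smooths them, and their row-sum, times the learning rate and the returned
+ // scale, becomes the update to w_h_.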
+ CuMatrix derivs_per_frame(c_t1_value); + derivs_per_frame.MulElements(h_t_deriv); + + // This "scale" value gets will get multiplied into the learning rate. + BaseFloat scale; + + preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); + + CuVector delta_w_h(w_h_.Dim()); + delta_w_h.AddRowSumMat(scale * learning_rate_, derivs_per_frame); + w_h_.AddVec(1.0, delta_w_h); + } +} + + + +void OutputGruNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &cell_dim_); + ExpectToken(is, binary, ""); + w_h_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_total_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + value_sum_.Scale(count_); // we read in the averages, not the sums. + deriv_sum_.Scale(count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_scale_); + BaseFloat alpha; + int32 rank, update_period; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &alpha); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &rank); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &update_period); + preconditioner_.SetRank(rank); + preconditioner_.SetAlpha(alpha); + preconditioner_.SetUpdatePeriod(update_period); + ExpectToken(is, binary, ""); +} + +void OutputGruNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, cell_dim_); + WriteToken(os, binary, ""); + w_h_.Write(os, binary); + { + // Write the value and derivative stats in a count-normalized way, for + // greater readability in text form. 
+ WriteToken(os, binary, ""); + Vector temp(value_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + temp.CopyFromVec(deriv_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_total_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_scale_); + + BaseFloat alpha = preconditioner_.GetAlpha(); + int32 rank = preconditioner_.GetRank(), + update_period = preconditioner_.GetUpdatePeriod(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, alpha); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rank); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, update_period); + WriteToken(os, binary, ""); +} + +void OutputGruNonlinearityComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + w_h_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; + } else { + w_h_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_ *= scale; + count_ *= scale; + } +} + +void OutputGruNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const OutputGruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + w_h_.AddVec(alpha, other->w_h_); + value_sum_.AddVec(alpha, other->value_sum_); + deriv_sum_.AddVec(alpha, other->deriv_sum_); + self_repair_total_ += alpha * other->self_repair_total_; + count_ += alpha * other->count_; +} + +void OutputGruNonlinearityComponent::ZeroStats() { + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; +} + +void OutputGruNonlinearityComponent::Check() const { + KALDI_ASSERT(cell_dim_ > 0 && + self_repair_threshold_ >= 0.0 && + self_repair_scale_ >= 0.0 ); + KALDI_ASSERT(w_h_.Dim() == cell_dim_); + KALDI_ASSERT(value_sum_.Dim() == cell_dim_ && + deriv_sum_.Dim() == cell_dim_); +} + +void OutputGruNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuVector temp_params(w_h_.Dim()); + temp_params.SetRandn(); + w_h_.AddVec(stddev, temp_params); +} + +BaseFloat OutputGruNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const OutputGruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return VecVec(w_h_, other->w_h_); +} + +int32 OutputGruNonlinearityComponent::NumParameters() const { + return w_h_.Dim(); +} + +void OutputGruNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyFromVec(w_h_); +} + + +void OutputGruNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + w_h_.CopyFromVec(params); +} + +void OutputGruNonlinearityComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_.Freeze(freeze); +} + +OutputGruNonlinearityComponent::OutputGruNonlinearityComponent( + const OutputGruNonlinearityComponent &other): + UpdatableComponent(other), + cell_dim_(other.cell_dim_), + w_h_(other.w_h_), + value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + self_repair_threshold_(other.self_repair_threshold_), + self_repair_scale_(other.self_repair_scale_), + 
preconditioner_(other.preconditioner_) { + Check(); +} + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-combined-component.h b/src/nnet3/nnet-combined-component.h new file mode 100644 index 00000000000..85011bd826d --- /dev/null +++ b/src/nnet3/nnet-combined-component.h @@ -0,0 +1,1109 @@ +// nnet3/nnet-combined-component.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_SPECIAL_COMPONENT_H_ +#define KALDI_NNET3_NNET_SPECIAL_COMPONENT_H_ + +#include "nnet3/nnet-common.h" +#include "nnet3/nnet-component-itf.h" +#include "nnet3/natural-gradient-online.h" +#include + +namespace kaldi { +namespace nnet3 { + +/// @file nnet-combined-component.h +/// You can view this as an overflow from nnet-simple-component.h. +/// It contains components which meet the definition of "simple" +/// components, i.e. they set the kSimpleComponent flag, but +/// which are more special-purpose, i.e. they are specific to +/// special layer types such as LSTMs, CNNs and GRUs. + + + +/** + * WARNING, this component is deprecated in favor of + * TimeHeightConvolutionComponent, and will be deleted. + * ConvolutionalComponent implements 2d-convolution. + * It uses 3D filters on 3D inputs, but the 3D filters hop only over + * 2 dimensions as it has same size as the input along the 3rd dimension. + * Input : A matrix where each row is a vectorized 3D-tensor. + * The 3D tensor has dimensions + * x: (e.g. time) + * y: (e.g. frequency) + * z: (e.g. channels like features/delta/delta-delta) + * + * The component supports input vectorizations of type zyx and yzx. + * The default vectorization type is zyx. + * e.g. for input vectorization of type zyx the input is vectorized by + * spanning axes z, y and x of the tensor in that order. + * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions + * the zyx vectorized input looks like + * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1) + * + * + * Output : The output is also a 3D tensor vectorized in the zyx format. + * The channel axis (z) in the output corresponds to the output of + * different filters. The first channel corresponds to the first filter + * i.e., first row of the filter_params_ matrix. + * + * Note: The component has to support yzx input vectorization as the binaries + * like add-deltas generate yz vectorized output. These input vectors are + * concatenated using the Append descriptor across time steps to form a yzx + * vectorized 3D tensor input. + * e.g. Append(Offset(input, -1), input, Offset(input, 1)) + * + * + * For information on the hyperparameters and parameters of this component see + * the variable declarations. 
+ * + * Propagation: + * ------------ + * Convolution operation consists of a dot-products between the filter tensor + * and input tensor patch, for various shifts of filter tensor along the x and y + * axes input tensor. (Note: there is no shift along z-axis as the filter and + * input tensor have same size along this axis). + * + * For a particular shift (i,j) of the filter tensor + * along input tensor dimensions x and y, the elements of the input tensor which + * overlap with the filter form the input tensor patch. This patch is vectorized + * in zyx format. All the patches corresponding to various samples in the + * mini-batch are stacked into a matrix, where each row corresponds to one + * patch. Let this matrix be represented by X_{i,j}. The dot products with + * various filters are computed simultaneously by computing the matrix product + * with the filter_params_ matrix (W) + * Y_{i,j} = X_{i,j}*W^T. + * Each row of W corresponds to one filter 3D tensor vectorized in zyx format. + * + * All the matrix products corresponding to various shifts (i,j) of the + * filter tensor are computed simultaneously using the AddMatMatBatched + * call of CuMatrixBase class. + * + * BackPropagation: + * ---------------- + * Backpropagation to compute the input derivative (\nabla X_{i,j}) + * consists of the a series of matrix products. + * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the + * output derivative for a particular shift of the filter. + * + * Once again these matrix products are computed simultaneously. + * + * Update: + * ------- + * The weight gradient is computed as + * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j}) + * + */ +class ConvolutionComponent: public UpdatableComponent { + public: + enum TensorVectorizationType { + kYzx = 0, + kZyx = 1 + }; + + ConvolutionComponent(); + // constructor using another component + ConvolutionComponent(const ConvolutionComponent &component); + // constructor using parameters + ConvolutionComponent( + const CuMatrixBase &filter_params, + const CuVectorBase &bias_params, + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + BaseFloat learning_rate); + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "ConvolutionComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput| + kBackpropAdds|kPropagateAdds; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + void Update(const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv, + const std::vector *>& out_deriv_batch); + + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions from base-class UpdatableComponent. 
+ virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + + // Some functions that are specific to this class. + void SetParams(const VectorBase &bias, + const MatrixBase &filter); + const CuVector &BiasParams() const { return bias_params_; } + const CuMatrix &LinearParams() const { return filter_params_; } + void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, int32 num_filters, + TensorVectorizationType input_vectorization, + BaseFloat param_stddev, BaseFloat bias_stddev); + // there is no filt_z_dim parameter as the length of the filter along + // z-dimension is same as the input + void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + std::string matrix_filename); + + // resize the component, setting the parameters to zero, while + // leaving any other configuration values the same + void Resize(int32 input_dim, int32 output_dim); + + void Update(const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv); + + + private: + int32 input_x_dim_; // size of the input along x-axis + // (e.g. number of time steps) + + int32 input_y_dim_; // size of input along y-axis + // (e.g. number of mel-frequency bins) + + int32 input_z_dim_; // size of input along z-axis + // (e.g. number of channels is 3 if the input has + // features + delta + delta-delta features + + int32 filt_x_dim_; // size of the filter along x-axis + + int32 filt_y_dim_; // size of the filter along y-axis + + // there is no filt_z_dim_ as it is always assumed to be + // the same as input_z_dim_ + + int32 filt_x_step_; // the number of steps taken along x-axis of input + // before computing the next dot-product + // of filter and input + + int32 filt_y_step_; // the number of steps taken along y-axis of input + // before computing the next dot-product of the filter + // and input + + // there is no filt_z_step_ as only dot product is possible along this axis + + TensorVectorizationType input_vectorization_; // type of vectorization of the + // input 3D tensor. Accepts zyx and yzx formats + + CuMatrix filter_params_; + // the filter (or kernel) matrix is a matrix of vectorized 3D filters + // where each row in the matrix corresponds to one filter. + // The 3D filter tensor is vectorizedin zyx format. + // The first row of the matrix corresponds to the first filter and so on. + // Keep in mind the vectorization type and order of filters when using file + // based initialization. + + CuVector bias_params_; + // the filter-specific bias vector (i.e., there is a seperate bias added + // to the output of each filter). + + void InputToInputPatches(const CuMatrixBase& in, + CuMatrix *patches) const; + void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const; + const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow. +}; + + +/* + LstmNonlinearityComponent is a component that implements part of an LSTM, by + combining together the sigmoids and tanh's, plus some diagonal terms, into + a single block. 
+ We will refer to the LSTM formulation used in
+
+ "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
+ by H. Sak et al,
+ http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
+
+ Suppose the cell dimension is C. Then outside this component, we compute
+ the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
+ matrix multiplication:
+
+ i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
+ f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
+ c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
+ o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
+
+ The part of the computation that takes place in this component is as follows.
+ Its input is of dimension 5C [however, search for 'dropout' below],
+ consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). Its
+ output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
+
+ To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
+
+ This component has parameters, 3C of them in total: the diagonal matrices w_{ic}, w_{fc}
+ and w_{oc}.
+
+
+ In the forward pass (Propagate), this component computes the following:
+
+ i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
+ f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
+ c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
+ o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
+ m_t = o_t * Tanh(c_t) (5)
+ # note: the outputs are just c_t and m_t.
+
+ [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead
+ of 5C; in this case, the last three input dimensions will be interpreted as
+ per-frame dropout masks on i_t, f_t and o_t respectively, so that on the RHS of
+ (3), i_t is replaced by i_t * i_t_scale, and likewise for f_t and o_t.]
+
+ The backprop is as you would think, but for the "self-repair" we need to pass
+ in additional vectors (of the same dim as the parameters of the layer) that
+ dictate whether or not we add an additional term to the backpropagated
+ derivatives. (This term helps force the input to the nonlinearities into the
+ range where the derivatives are not too small).
+
+ This component stores stats of the same form as are normally stored by the
+ StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
+ activations and derivatives, but this is done inside the Backprop() functions.
+ [the StoreStats() functions don't take the input data as an argument, so
+ storing this data that way is impossible, and anyway it's more efficient to
+ do it as part of backprop.]
+
+ Configuration values accepted:
+ cell-dim e.g. cell-dim=1024 Cell dimension. The input
+ dimension of this component is cell-dim * 5, and the
+ output dimension is cell-dim * 2. Note: this
+ component implements only part of the LSTM layer,
+ see comments above.
+ param-stddev Standard deviation for random initialization of
+ the diagonal matrices (AKA peephole connections).
+ default=1.0, which is probably too high but
+ we couldn't see any reliable gain from decreasing it.
+ tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold
+ in a TanhComponent; applies to both the tanh nonlinearities.
+ default=0.2, you probably won't want to change this.
+ sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold
+ in a SigmoidComponent; applies to all three of the sigmoid
+ nonlinearities. default=0.05, you probably won't want to
+ change this.
+ self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent + or TanhComponent; applies to both the sigmoid and tanh + nonlinearities. default=1.0e-05, which you probably won't + want to change unless dealing with an objective function + that has smaller or larger dynamic range than normal, in + which case you might want to make it smaller or larger. +*/ +class LstmNonlinearityComponent: public UpdatableComponent { + public: + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + LstmNonlinearityComponent(): use_dropout_(false) { } + virtual std::string Type() const { return "LstmNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions from base-class UpdatableComponent. + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void ZeroStats(); + virtual void FreezeNaturalGradient(bool freeze); + + // Some functions that are specific to this class: + explicit LstmNonlinearityComponent( + const LstmNonlinearityComponent &other); + + void Init(int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale); + + virtual void ConsolidateMemory(); + + private: + + // Initializes the natural-gradient object with the configuration we + // use for this object, which for now is hardcoded at the C++ level. + void InitNaturalGradient(); + + // Notation: C is the cell dimension; it equals params_.NumCols(). + + // The dimension of the parameter matrix is (3 x C); + // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. + CuMatrix params_; + + // If true, we expect an extra 3 dimensions on the input, for dropout masks + // for i_t and f_t. + bool use_dropout_; + + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in + // equations (1) through (5), this is the sum of the values of the nonliearities + // (used for diagnostics only). It is comparable to value_sum_ vector + // in base-class NonlinearComponent. + CuMatrix value_sum_; + + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in + // equations (1) through (5), this is the sum of the derivatives of the + // nonliearities (used for diagnostics and to control self-repair). It is + // comparable to the deriv_sum_ vector in base-class + // NonlinearComponent. + CuMatrix deriv_sum_; + + // This matrix has dimension 10. 
+ // The contents are a block of 5 self-repair
+ // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
+ // self-repair scales (typically all 0.00001). These are for each of the 5
+ // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
+ // more info).
+ CuVector<BaseFloat> self_repair_config_;
+
+ // This matrix has dimension 5. For each of the 5 nonlinearities in the LSTM
+ // component (see comments in cu-math.h for more info), it contains the total,
+ // over all frames represented in count_, of the number of dimensions that
+ // were subject to self_repair. To get the self-repair proportion you should
+ // divide by (count_ times cell_dim_).
+ CuVector<BaseFloat> self_repair_total_;
+
+ // The total count (number of frames) corresponding to the stats in value_sum_
+ // and deriv_sum_.
+ double count_;
+
+ // Preconditioner for the parameters of this component [operates in the space
+ // of dimension C].
+ // The preconditioner stores its own configuration values; we write and read
+ // these, but not the preconditioner object itself.
+ OnlineNaturalGradient preconditioner_;
+
+ const LstmNonlinearityComponent &operator
+ = (const LstmNonlinearityComponent &other); // Disallow.
+};
+
+
+
+
+/*
+ * WARNING, this component is deprecated as it's not compatible with
+ * TimeHeightConvolutionComponent, and it will eventually be deleted.
+ * MaxPoolingComponent :
+ * The max-pooling component was first used in ConvNets for selecting a
+ * representative activation in an area. It inspired the Maxout nonlinearity.
+ * Each output element of this component is the maximum of a block of
+ * input elements where the block has a 3D dimension (pool_x_size_,
+ * pool_y_size_, pool_z_size_).
+ * Blocks could overlap if the shift value on any axis is smaller
+ * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
+ * If the shift values are equal to their pool sizes, there is no
+ * overlap; whereas if they all equal 1, the blocks overlap to
+ * the greatest possible extent.
+ *
+ * This component is designed to be used after a ConvolutionComponent
+ * so that the input matrix is propagated from a 2d-convolutional layer.
+ * This component implements 3d-maxpooling which performs
+ * max pooling along the three axes.
+ * Input : A matrix where each row is a vectorized 3D-tensor.
+ * The 3D tensor has dimensions
+ * x: (e.g. time)
+ * y: (e.g. frequency)
+ * z: (e.g. channels like number of filters in the ConvolutionComponent)
+ *
+ * The component assumes input vectorizations of type zyx
+ * which is the default output vectorization type of a ConvolutionComponent.
+ * e.g. for input vectorization of type zyx the input is vectorized by
+ * spanning axes z, y and x of the tensor in that order.
+ * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
+ * the zyx vectorized input looks like
+ * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
+ *
+ * Output : The output is also a 3D tensor vectorized in the zyx format.
+ *
+ * For information on the hyperparameters and parameters of this component see
+ * the variable declarations.
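+ * As a small example of the overlap rule above: with pool_x_size_ = 2 and
+ * pool_x_step_ = 1, neighbouring pools along the x-axis share one input
+ * position; with pool_x_step_ = 2 they are disjoint.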
+ * + * + */ +class MaxpoolingComponent: public Component { + public: + + MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), + pool_x_size_(0), pool_y_size_(0), pool_z_size_(0), + pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { } + // constructor using another component + MaxpoolingComponent(const MaxpoolingComponent &component); + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "MaxpoolingComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput| + kBackpropAdds; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new MaxpoolingComponent(*this); } + + + protected: + void InputToInputPatches(const CuMatrixBase& in, + CuMatrix *patches) const; + void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const; + virtual void Check() const; + + + int32 input_x_dim_; // size of the input along x-axis + // (e.g. number of time steps) + int32 input_y_dim_; // size of input along y-axis + // (e.g. number of mel-frequency bins) + int32 input_z_dim_; // size of input along z-axis + // (e.g. number of filters in the ConvolutionComponent) + + int32 pool_x_size_; // size of the pooling window along x-axis + int32 pool_y_size_; // size of the pooling window along y-axis + int32 pool_z_size_; // size of the pooling window along z-axis + + int32 pool_x_step_; // the number of steps taken along x-axis of input + // before computing the next pool + int32 pool_y_step_; // the number of steps taken along y-axis of input + // before computing the next pool + int32 pool_z_step_; // the number of steps taken along z-axis of input + // before computing the next pool + +}; + + +/** + GruNonlinearityComponent is a component that implements part of a + Gated Recurrent Unit (GRU). This is more efficient in time and + memory than stitching it together using more basic components. + For a brief summary of what this actually computes, search + for 'recap' below; the first part of this comment establishes + the context. + + This component supports two cases: the regular GRU + (as described in "Empirical Evaluation of + Gated Recurrent Neural Networks on Sequence Modeling", + https://arxiv.org/pdf/1412.3555.pdf), + and our "projected GRU" which takes ideas from the + paper we'll abbreviate as "LSTM based RNN architectures for LVCSR", + https://arxiv.org/pdf/1402.1128.pdf. + + Before describing what this component does, we'll establish + some notation for the GRU. + + First, the regular (non-projected) GRU. In order to unify the notation with + our "projected GRU", we'll use slightly different variable names. We'll also + ignore the bias terms for purposes of this exposition (let them be implicit). 
+ + + Regular GRU: + + z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate, dim == cell_dim + r_t = \sigmoid ( U^r x_t + W^r y_{t-1} ) # reset gate, dim == cell_dim + h_t = \tanh ( U^h x_t + W^h ( y_{t-1} \dot r_t ) ) # dim == cell_dim + y_t = ( 1 - z_t ) \dot h_t + z_t \dot y_{t-1} # dim == cell_dim + + For the "projected GRU", the 'cell_dim x cell_dim' full-matrix expressions W^z + W^r and W^h that participate in the expressions for z_t, r_t and h_t are + replaced with skinny matrices of dimension 'cell_dim x recurrent_dim' + (where recurrent_dim < cell_dim) and the output is replaced by + a lower-dimension projection of the hidden state, of dimension + 'recurrent_dim + non_recurrent_dim < cell_dim', instead of the + full 'cell_dim'. We rename y_t to c_t (this name is inspired by LSTMs), and + we now let the output (still called y_t) be a projection of c_t. + s_t is a dimension range of the output y_t. Parameters of the + projected GRU: + cell_dim > 0 + recurrent_dim > 0 + non_recurrent_dim > 0 (where non_recurrent_dim + recurrent_dim < cell_dim). + + + Equations: + + z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate, dim(z_t) == cell_dim + r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate, dim(r_t) == recurrent_dim + h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) ) # dim(h_t) == cell_dim + c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # dim(c_t) == cell_dim + y_t = W^y c_t # dim(y_t) = recurrent_dim + non_recurrent_dim. This is + # the output of the GRU. + s_t = y_t[0:recurrent_dim-1] # dimension range of y_t, dim(s_t) = recurrent_dim. + + + Because we'll need it below, we define + hpart_t = U^h x_t + which is a subexpression of h_t. + + Our choice to make a "special" component for the projected GRU is to have + it be a function from + (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t) + That is, the input to the component is all those things on the LHS + appended together, and the output is the two things on the + RHS appended together. The dimensions are: + (cell_dim, recurrent_dim, cell_dim, cell_dim, recurrent_dim) -> (cell_dim, cell_dim). + The component computes the functions: + h_t = \tanh( hpart_t + W^h (s_{t-1} \dot r_t)) + c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + + Notice that 'W^h' is the only parameter that lives inside the component. + + You might also notice that the output 'h_t' is never actually used + in any other part of the GRU, so the question arises: why is it + necessary to have it be an output of the component? This has to do with + saving computation: because h_t is an output, and we'll be defining + the kBackpropNeedsOutput flag, it is available in the backprop phase + and this helps us avoid some computation (otherwise we'd have to do + a redundant multiplication by W^h in the backprop phase that we already + did in the forward phase). We could have used the 'memo' mechanism to + do this, but this is undesirable because the use of a memo disables + 'update consolidation' in the backprop so we'd lose a little + speed there. + + In the case where it's a regular, not projected GRU, this component + is a function from + (z_t, r_t, hpart_t, y_{t-1}) -> (h_t, y_t) + We can actually do this with the same code as the projected-GRU code, + we just make sure that recurrent_dim == cell_dim, and the only structural + difference is that c_{t-1} and s_{t-1} become the same variable (y_{t-1}), + and we rename c_t to y_t. 
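To make the dimension bookkeeping concrete, here is a minimal single-frame sketch of the mapping just described. The function name and the plain std::vector types are for exposition only; the actual component operates on CuMatrix minibatches and additionally handles derivatives, stats and natural-gradient updates.

#include <cassert>
#include <cmath>
#include <vector>

typedef std::vector<float> Vec;
typedef std::vector<Vec> Mat;   // Mat[i][j]: row i, column j

// Single-frame version of (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t).
// w_h has cell_dim rows and recurrent_dim columns.
void GruNonlinearityForward(const Vec &z, const Vec &r, const Vec &hpart,
                            const Vec &c_prev, const Vec &s_prev,
                            const Mat &w_h, Vec *h, Vec *c) {
  int cell_dim = static_cast<int>(z.size()),
      recurrent_dim = static_cast<int>(r.size());
  assert(static_cast<int>(hpart.size()) == cell_dim &&
         static_cast<int>(c_prev.size()) == cell_dim &&
         static_cast<int>(s_prev.size()) == recurrent_dim);
  Vec sdotr(recurrent_dim);
  for (int j = 0; j < recurrent_dim; j++)
    sdotr[j] = s_prev[j] * r[j];                  // s_{t-1} \dot r_t
  h->assign(cell_dim, 0.0f);
  c->assign(cell_dim, 0.0f);
  for (int i = 0; i < cell_dim; i++) {
    float sum = hpart[i];                         // hpart_t = U^h x_t, from a preceding affine layer
    for (int j = 0; j < recurrent_dim; j++)
      sum += w_h[i][j] * sdotr[j];                // + W^h (s_{t-1} \dot r_t)
    (*h)[i] = std::tanh(sum);
    (*c)[i] = (1.0f - z[i]) * (*h)[i] + z[i] * c_prev[i];
  }
}

For the non-projected case the same sketch applies with recurrent_dim == cell_dim and c_prev == s_prev (both equal to y_{t-1}).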
+ + This component stores stats of the same form as are normally stored by the + StoreStats() functions for the sigmoid and tanh units, i.e. averages of the + activations and derivatives, but this is done inside the Backprop() functions. + + + The main configuration values that are accepted: + cell-dim e.g. cell-dim=1024 Cell dimension. + recurrent-dim e.g. recurrent-dim=256. If not specified, we assume + this is a non-projected GRU. + param-stddev Standard deviation for random initialization of + the matrix W^h. Defaults to 1.0 / sqrt(d) where + d is recurrent-dim if specified, else cell-dim. + self-repair-threshold Equivalent to the self-repair-lower-threshold + in a TanhComponent; applies to the tanh nonlinearity. + default=0.2, you probably won't want to change this. + self-repair-scale Equivalent to the self-repair-scale in a + TanhComponent; applies to the tanh nonlinearity. + default=1.0e-05, which you probably won't want to + change unless dealing with an objective function that + has smaller or larger dynamic range than normal, in + which case you might want to make it smaller or + larger. + + Values inherited from UpdatableComponent (see its declaration in + nnet-component-itf.h for details): + learning-rate + learning-rate-factor + max-change + + Natural-gradient related options are below; you won't normally have to + set these. + alpha Constant that determines how much we smooth the + Fisher-matrix estimates with the unit matrix. + Larger means more smoothing. default=4.0 + rank-in Rank used in low-rank-plus-unit estimate of Fisher + matrix in the input space. default=20. + rank-out Rank used in low-rank-plus-unit estimate of Fisher + matrix in the output-derivative space. default=80. + update-period Determines the period (in minibatches) with which + we update the Fisher-matrix estimates; + making this > 1 saves a little time in training. + default=4. + + + Recap of what this computes: + If recurrent-dim is specified, this component implements + the function + (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t) + of dims: + (cell_dim, recurrent_dim, cell_dim, cell_dim, recurrent_dim) -> (cell_dim, cell_dim). + where: + h_t = \tanh( hpart_t + W^h (s_{t-1} \dot r_t)) + c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + If recurrent-dim is not specified, this component implements + the function + (z_t, r_t, hpart_t, y_{t-1}) -> (h_t, y_t) + of dimensions + (cell_dim, cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim), + where: + h_t = \tanh( hpart_t + W^h (y_{t-1} \dot r_t)) + y_t = (1 - z_t) \dot h_t + z_t \dot y_{t-1}. 
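For concreteness, a configuration line exercising the main options above might look like the following; the component name and the particular values are invented for illustration and are not taken from any recipe:

component name=gru1_nonlin type=GruNonlinearityComponent cell-dim=1024 recurrent-dim=256 self-repair-scale=1.0e-05 max-change=0.75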
+*/ +class GruNonlinearityComponent: public UpdatableComponent { + public: + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + GruNonlinearityComponent() { } + virtual std::string Type() const { return "GruNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|\ + kBackpropNeedsOutput|kBackpropAdds; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const { return new GruNonlinearityComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + + // Some functions from base-class UpdatableComponent. + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void ZeroStats(); + virtual void FreezeNaturalGradient(bool freeze); + + // Some functions that are specific to this class: + explicit GruNonlinearityComponent( + const GruNonlinearityComponent &other); + + private: + + void Check() const; // checks dimensions, etc. + + /** + This function stores value and derivative stats for the tanh + nonlinearity that is a part of this component, and if needed + adds the small 'self-repair' term to 'h_t_deriv'. + @param [in] h_t The output of the tanh expression from the + forward pass. + @param [in,out] h_t_deriv To here will be added the small + self-repair term (this is a small value + that we use to push oversaturated neurons + back to the center). + This function has side effects on the class instance, specifically the + members value_sum_, deriv_sum, self_repair_total_, and count_. + */ + void TanhStatsAndSelfRepair(const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv); + + /* This function is responsible for updating the w_h_ matrix + (taking into account the learning rate). + @param [in] sdotr The value of the expression (s_{t-1} \dot r_t). + @param [in] h_t_deriv The derivative of the objective + function w.r.t. the argument of the tanh + function, i.e. w.r.t. the expression + "hpart_t + W^h (s_{t-1} \dot r_t)". + This function is concerned with the second + term as it affects the derivative w.r.t. W^h. + */ + void UpdateParameters(const CuMatrixBase &sdotr, + const CuMatrixBase &h_t_deriv); + + + int32 cell_dim_; // cell dimension, e.g. 1024. + int32 recurrent_dim_; // recurrent dimension, e.g. 256 for projected GRU; + // if it's the same as cell_dim it means we are + // implementing regular (non-projected) GRU + + + // The matrix W^h, of dimension cell_dim_ by recurrent_dim_. + // There is no bias term needed here because hpart_t comes from + // an affine component that has a bias. + CuMatrix w_h_; + + // Of dimension cell_dim_, this is comparable to the value_sum_ vector in + // class NonlinearComponent. It stores the sum of the tanh nonlinearity. 
+  // Normalize by dividing by count_.
+  CuVector value_sum_;
+
+  // Of dimension cell_dim_, this is comparable to the deriv_sum_ vector in
+  // class NonlinearComponent.  It stores the sum of the function-derivative of
+  // the tanh nonlinearity.  Normalize by dividing by count_.
+  CuVector deriv_sum_;
+
+  // This is part of the stats (along with value_sum_, deriv_sum_, and count_);
+  // if you divide it by count_ it gives you the proportion of the time that an
+  // average dimension was subject to self-repair.
+  double self_repair_total_;
+
+  // The total count (number of frames) corresponding to the stats in value_sum_,
+  // deriv_sum_, and self_repair_total_.
+  double count_;
+
+  // A configuration parameter, this determines how saturated the derivative
+  // has to be for a particular dimension before we activate self-repair.
+  // Default value is 0.2, the same as for TanhComponent.
+  BaseFloat self_repair_threshold_;
+
+  // A configuration parameter, this determines the maximum absolute value of
+  // the extra term that we add to the input derivative of the tanh when doing
+  // self repair.  The default value is 1.0e-05.
+  BaseFloat self_repair_scale_;
+
+  // Preconditioner for the input space when updating w_h_ (has dimension
+  // recurrent_dim_ if use-natural-gradient was true, else not set up).
+  // The preconditioner stores its own configuration values; we write and read
+  // these, but not the preconditioner object itself.
+  OnlineNaturalGradient preconditioner_in_;
+
+  // Preconditioner for the output space when updating w_h_ (has dimension
+  // recurrent_dim_ if use-natural-gradient was true, else not set up).
+  OnlineNaturalGradient preconditioner_out_;
+
+  const GruNonlinearityComponent &operator
+      = (const GruNonlinearityComponent &other); // Disallow.
+};
+
+
+/**
+   OutputGruNonlinearityComponent is a component that implements part of an
+   Output-Gated Recurrent Unit (OGRU).  Compared with the traditional GRU, it
+   uses an output gate instead of a reset gate, and the formula for h_t is
+   different.  You can regard it as a variant of the GRU.
+   This code is more efficient in time and memory than stitching the unit
+   together from more basic components.
+   For a brief summary of what this actually computes, search for 'recap' below;
+   the first part of this comment establishes the context.  For more information
+   about GRUs, please see the summary in GruNonlinearityComponent.
+
+   Before describing what this component does, we'll establish
+   some notation for the OGRU.
+
+   We use the same notation as for the GRU above.  We'll also
+   ignore the bias terms for purposes of this exposition (let them be implicit).
+
+
+   Regular OGRU:
+
+   z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate, dim == cell_dim
+   o_t = \sigmoid ( U^o x_t + W^o y_{t-1} ) # output gate, dim == cell_dim
+   h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) # dim == cell_dim
+   c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # dim == cell_dim
+   y_t = ( c_t \dot o_t )
+
+   For the "projected OGRU", the 'cell_dim x cell_dim' full-matrix expressions
+   W^z and W^o that participate in the expressions for z_t and o_t are
+   replaced with skinny matrices of dimension 'cell_dim x recurrent_dim'
+   (where recurrent_dim < cell_dim), and the output is replaced by
+   a lower-dimension projection of the hidden state, of dimension
+   'recurrent_dim + non_recurrent_dim < cell_dim', instead of the
+   full 'cell_dim'.
+   s_t is a dimension range of the output y_t.
Parameters of the + projected OGRU: + cell_dim > 0 + recurrent_dim > 0 + non_recurrent_dim > 0 (where non_recurrent_dim + recurrent_dim <= cell_dim). + + + Equations: + + z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate, dim(z_t) == cell_dim + o_t = \sigmoid ( U^o x_t + W^o s_{t-1} ) # output gate, dim(o_t) == cell_dim + h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) # dim(h_t) == cell_dim + c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # dim(c_t) == cell_dim + y_t = ( c_t \dot o_t) W^y # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the OGRU. + s_t = y_t[0:recurrent_dim-1] # dimension range of y_t, dim(s_t) = recurrent_dim. + + + Because we'll need it below, we define + hpart_t = U^h x_t + which is a subexpression of h_t. + + Our choice to make a "special" component for the projected OGRU is to have + it be a function from + (z_t, hpart_t, c_{t-1}) -> (h_t, c_t) + That is, the input to the component is all those things on the LHS + appended together, and the output is the two things on the + RHS appended together. The dimensions are: + (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim). + The component computes the functions: + h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) + c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + + Notice that 'W^h' is the only parameter that lives inside the component. + + You might also notice that the output 'h_t' is never actually used + in any other part of the GRU, so the question arises: why is it + necessary to have it be an output of the component? This has to do with + saving computation: because h_t is an output, and we'll be defining + the kBackpropNeedsOutput flag, it is available in the backprop phase + and this helps us avoid some computation (otherwise we'd have to do + a redundant multiplication by W^h in the backprop phase that we already + did in the forward phase). We could have used the 'memo' mechanism to + do this, but this is undesirable because the use of a memo disables + 'update consolidation' in the backprop so we'd lose a little + speed there. + + This component stores stats of the same form as are normally stored by the + StoreStats() functions for the sigmoid and tanh units, i.e. averages of the + activations and derivatives, but this is done inside the Backprop() functions. + + + The main configuration values that are accepted: + cell-dim e.g. cell-dim=1024 Cell dimension. + recurrent-dim e.g. recurrent-dim=256. If not specified, we assume + this is a non-projected GRU. + param-stddev Standard deviation for random initialization of + the matrix W^h. Defaults to 1.0 / sqrt(d) where + d is recurrent-dim if specified, else cell-dim. + self-repair-threshold Equivalent to the self-repair-lower-threshold + in a TanhComponent; applies to the tanh nonlinearity. + default=0.2, you probably won't want to change this. + self-repair-scale Equivalent to the self-repair-scale in a + TanhComponent; applies to the tanh nonlinearity. + default=1.0e-05, which you probably won't want to + change unless dealing with an objective function that + has smaller or larger dynamic range than normal, in + which case you might want to make it smaller or + larger. + + Values inherited from UpdatableComponent (see its declaration in + nnet-component-itf.h for details): + learning-rate + learning-rate-factor + max-change + + Natural-gradient related options are below; you won't normally have to + set these. + alpha Constant that determines how much we smooth the + Fisher-matrix estimates with the unit matrix. 
+ Larger means more smoothing. default=4.0 + rank The rank of the correction to the unit matrix. + default=8. + update-period Determines the period (in minibatches) with which + we update the Fisher-matrix estimates; + making this > 1 saves a little time in training. + default=10. + + + Recap of what this computes: + This component implements the function + (z_t, hpart_t, c_{t-1}) -> (h_t, c_t) + of dimensions + (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim), + where: + h_t = \tanh( hpart_t + W^h \dot c_{t-1} ) + c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. +*/ +class OutputGruNonlinearityComponent: public UpdatableComponent { + public: + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + OutputGruNonlinearityComponent() { } + virtual std::string Type() const { return "OutputGruNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|\ + kBackpropNeedsOutput|kBackpropAdds; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const { return new OutputGruNonlinearityComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + + // Some functions from base-class UpdatableComponent. + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void ZeroStats(); + virtual void FreezeNaturalGradient(bool freeze); + + // Some functions that are specific to this class: + explicit OutputGruNonlinearityComponent( + const OutputGruNonlinearityComponent &other); + + private: + + void Check() const; // checks dimensions, etc. + + /** + This function stores value and derivative stats for the tanh + nonlinearity that is a part of this component, and if needed + adds the small 'self-repair' term to 'h_t_deriv'. + @param [in] h_t The output of the tanh expression from the + forward pass. + @param [in,out] h_t_deriv To here will be added the small + self-repair term (this is a small value + that we use to push oversaturated neurons + back to the center). + This function has side effects on the class instance, specifically the + members value_sum_, deriv_sum, self_repair_total_, and count_. + */ + void TanhStatsAndSelfRepair(const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv); + + /* This function is responsible for updating the w_h_ matrix + (taking into account the learning rate). + @param [in] c_t1_value The value of c_{t-1}. + @param [in] h_t_deriv The derivative of the objective + function w.r.t. the argument of the tanh + function, i.e. w.r.t. the expression + "hpart_t + W^h \dot c_t1". + This function is concerned with the second + term as it affects the derivative w.r.t. W^h. 
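As a sanity check on the description above, the following sketch shows what the plain-SGD form of this update amounts to when natural gradient and max-change are ignored; it is illustrative only and the types are simplified (the real code works on CuMatrix/CuVector).

#include <cstddef>
#include <vector>

// Since W^h acts per-dimension here, the tanh argument depends on w_h[i]
// only through w_h[i] * c_{t-1}(t, i), so
//   d(objf)/d(w_h[i]) = sum_t h_t_deriv(t, i) * c_t1_value(t, i).
void UpdateWhSketch(const std::vector<std::vector<float> > &c_t1_value,  // frames x cell_dim
                    const std::vector<std::vector<float> > &h_t_deriv,   // frames x cell_dim
                    float learning_rate,
                    std::vector<float> *w_h) {                           // dim cell_dim
  for (size_t t = 0; t < c_t1_value.size(); t++)
    for (size_t i = 0; i < w_h->size(); i++)
      (*w_h)[i] += learning_rate * h_t_deriv[t][i] * c_t1_value[t][i];
}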
+  */
+  void UpdateParameters(const CuMatrixBase &c_t1_value,
+                        const CuMatrixBase &h_t_deriv);
+
+
+  int32 cell_dim_; // cell dimension, e.g. 1024.
+
+  // The parameters W^h.  In this component W^h acts per-dimension (elementwise
+  // on c_{t-1}), so it is stored as a vector of dimension cell_dim_.
+  // There is no bias term needed here because hpart_t comes from
+  // an affine component that has a bias.
+  CuVector w_h_;
+
+  // Of dimension cell_dim_, this is comparable to the value_sum_ vector in
+  // class NonlinearComponent.  It stores the sum of the tanh nonlinearity.
+  // Normalize by dividing by count_.
+  CuVector value_sum_;
+
+  // Of dimension cell_dim_, this is comparable to the deriv_sum_ vector in
+  // class NonlinearComponent.  It stores the sum of the function-derivative of
+  // the tanh nonlinearity.  Normalize by dividing by count_.
+  CuVector deriv_sum_;
+
+  // This is part of the stats (along with value_sum_, deriv_sum_, and count_);
+  // if you divide it by count_ it gives you the proportion of the time that an
+  // average dimension was subject to self-repair.
+  double self_repair_total_;
+
+  // The total count (number of frames) corresponding to the stats in value_sum_,
+  // deriv_sum_, and self_repair_total_.
+  double count_;
+
+  // A configuration parameter, this determines how saturated the derivative
+  // has to be for a particular dimension before we activate self-repair.
+  // Default value is 0.2, the same as for TanhComponent.
+  BaseFloat self_repair_threshold_;
+
+  // A configuration parameter, this determines the maximum absolute value of
+  // the extra term that we add to the input derivative of the tanh when doing
+  // self repair.  The default value is 1.0e-05.
+  BaseFloat self_repair_scale_;
+
+  // Unlike GruNonlinearityComponent, the parameters here are a vector rather
+  // than a matrix, so there is only one dimension to consider and we only
+  // need one preconditioner.
+  OnlineNaturalGradient preconditioner_;
+
+  const OutputGruNonlinearityComponent &operator
+      = (const OutputGruNonlinearityComponent &other); // Disallow.
+}; + + +} // namespace nnet3 +} // namespace kaldi + + +#endif diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index d2d325d22f1..1ff7daa01d1 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -23,6 +23,7 @@ #include #include "nnet3/nnet-component-itf.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-combined-component.h" #include "nnet3/nnet-normalize-component.h" #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-convolutional-component.h" @@ -178,6 +179,10 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new RestrictedAttentionComponent(); } else if (component_type == "SumBlockComponent") { ans = new SumBlockComponent(); + } else if (component_type == "GruNonlinearityComponent") { + ans = new GruNonlinearityComponent(); + } else if (component_type == "OutputGruNonlinearityComponent") { + ans = new OutputGruNonlinearityComponent(); } else if (component_type == "ScaleAndOffsetComponent") { ans = new ScaleAndOffsetComponent(); } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 7a5eb7017a3..e8c99494b06 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1,6 +1,6 @@ // nnet3/nnet-simple-component.cc -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) // 2015 Xiaohui Zhang // 2015 Guoguo Chen // 2015 Daniel Galvez @@ -3942,939 +3942,6 @@ void NaturalGradientPerElementScaleComponent::ConsolidateMemory() { preconditioner_.Swap(&temp); } -// Constructors for the convolution component -ConvolutionComponent::ConvolutionComponent(): - UpdatableComponent(), - input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), - filt_x_dim_(0), filt_y_dim_(0), - filt_x_step_(0), filt_y_step_(0), - input_vectorization_(kZyx) { } - -ConvolutionComponent::ConvolutionComponent( - const ConvolutionComponent &component): - UpdatableComponent(component), - input_x_dim_(component.input_x_dim_), - input_y_dim_(component.input_y_dim_), - input_z_dim_(component.input_z_dim_), - filt_x_dim_(component.filt_x_dim_), - filt_y_dim_(component.filt_y_dim_), - filt_x_step_(component.filt_x_step_), - filt_y_step_(component.filt_y_step_), - input_vectorization_(component.input_vectorization_), - filter_params_(component.filter_params_), - bias_params_(component.bias_params_) { } - -ConvolutionComponent::ConvolutionComponent( - const CuMatrixBase &filter_params, - const CuVectorBase &bias_params, - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - BaseFloat learning_rate): - input_x_dim_(input_x_dim), - input_y_dim_(input_y_dim), - input_z_dim_(input_z_dim), - filt_x_dim_(filt_x_dim), - filt_y_dim_(filt_y_dim), - filt_x_step_(filt_x_step), - filt_y_step_(filt_y_step), - input_vectorization_(input_vectorization), - filter_params_(filter_params), - bias_params_(bias_params){ - KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() && - bias_params.Dim() != 0); - KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim); - SetUnderlyingLearningRate(learning_rate); - is_gradient_ = false; -} - -// aquire input dim -int32 ConvolutionComponent::InputDim() const { - return input_x_dim_ * input_y_dim_ * input_z_dim_; -} - -// aquire output dim -int32 ConvolutionComponent::OutputDim() const { - 
int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); - int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); - int32 num_filters = filter_params_.NumRows(); - return num_x_steps * num_y_steps * num_filters; -} - -// initialize the component using hyperparameters -void ConvolutionComponent::Init( - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, int32 num_filters, - TensorVectorizationType input_vectorization, - BaseFloat param_stddev, BaseFloat bias_stddev) { - input_x_dim_ = input_x_dim; - input_y_dim_ = input_y_dim; - input_z_dim_ = input_z_dim; - filt_x_dim_ = filt_x_dim; - filt_y_dim_ = filt_y_dim; - filt_x_step_ = filt_x_step; - filt_y_step_ = filt_y_step; - input_vectorization_ = input_vectorization; - KALDI_ASSERT((input_x_dim_ - filt_x_dim_) % filt_x_step_ == 0); - KALDI_ASSERT((input_y_dim_ - filt_y_dim_) % filt_y_step_ == 0); - int32 filter_dim = filt_x_dim_ * filt_y_dim_ * input_z_dim_; - filter_params_.Resize(num_filters, filter_dim); - bias_params_.Resize(num_filters); - KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0); - filter_params_.SetRandn(); - filter_params_.Scale(param_stddev); - bias_params_.SetRandn(); - bias_params_.Scale(bias_stddev); -} - -// initialize the component using predefined matrix file -void ConvolutionComponent::Init( - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - std::string matrix_filename) { - input_x_dim_ = input_x_dim; - input_y_dim_ = input_y_dim; - input_z_dim_ = input_z_dim; - filt_x_dim_ = filt_x_dim; - filt_y_dim_ = filt_y_dim; - filt_x_step_ = filt_x_step; - filt_y_step_ = filt_y_step; - input_vectorization_ = input_vectorization; - CuMatrix mat; - ReadKaldiObject(matrix_filename, &mat); - int32 filter_dim = (filt_x_dim_ * filt_y_dim_ * input_z_dim_); - int32 num_filters = mat.NumRows(); - KALDI_ASSERT(mat.NumCols() == (filter_dim + 1)); - filter_params_.Resize(num_filters, filter_dim); - bias_params_.Resize(num_filters); - filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim)); - bias_params_.CopyColFromMat(mat, filter_dim); -} - -// display information about component -std::string ConvolutionComponent::Info() const { - std::ostringstream stream; - stream << UpdatableComponent::Info() - << ", input-x-dim=" << input_x_dim_ - << ", input-y-dim=" << input_y_dim_ - << ", input-z-dim=" << input_z_dim_ - << ", filt-x-dim=" << filt_x_dim_ - << ", filt-y-dim=" << filt_y_dim_ - << ", filt-x-step=" << filt_x_step_ - << ", filt-y-step=" << filt_y_step_ - << ", input-vectorization=" << input_vectorization_ - << ", num-filters=" << filter_params_.NumRows(); - PrintParameterStats(stream, "filter-params", filter_params_); - PrintParameterStats(stream, "bias-params", bias_params_, true); - return stream.str(); -} - -// initialize the component using configuration file -void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) { - bool ok = true; - std::string matrix_filename; - int32 input_x_dim = -1, input_y_dim = -1, input_z_dim = -1, - filt_x_dim = -1, filt_y_dim = -1, - filt_x_step = -1, filt_y_step = -1, - num_filters = -1; - std::string input_vectorization_order = "zyx"; - InitLearningRatesFromConfig(cfl); - ok = ok && cfl->GetValue("input-x-dim", &input_x_dim); - ok = ok && cfl->GetValue("input-y-dim", &input_y_dim); - ok = ok && cfl->GetValue("input-z-dim", &input_z_dim); 
- ok = ok && cfl->GetValue("filt-x-dim", &filt_x_dim); - ok = ok && cfl->GetValue("filt-y-dim", &filt_y_dim); - ok = ok && cfl->GetValue("filt-x-step", &filt_x_step); - ok = ok && cfl->GetValue("filt-y-step", &filt_y_step); - - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - // optional argument - TensorVectorizationType input_vectorization; - cfl->GetValue("input-vectorization-order", &input_vectorization_order); - if (input_vectorization_order.compare("zyx") == 0) { - input_vectorization = kZyx; - } else if (input_vectorization_order.compare("yzx") == 0) { - input_vectorization = kYzx; - } else { - KALDI_ERR << "Unknown or unsupported input vectorization order " - << input_vectorization_order - << " accepted candidates are 'yzx' and 'zyx'"; - } - - if (cfl->GetValue("matrix", &matrix_filename)) { - // initialize from prefined parameter matrix - Init(input_x_dim, input_y_dim, input_z_dim, - filt_x_dim, filt_y_dim, - filt_x_step, filt_y_step, - input_vectorization, - matrix_filename); - } else { - ok = ok && cfl->GetValue("num-filters", &num_filters); - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - // initialize from configuration - int32 filter_input_dim = filt_x_dim * filt_y_dim * input_z_dim; - BaseFloat param_stddev = 1.0 / std::sqrt(filter_input_dim), bias_stddev = 1.0; - cfl->GetValue("param-stddev", ¶m_stddev); - cfl->GetValue("bias-stddev", &bias_stddev); - Init(input_x_dim, input_y_dim, input_z_dim, - filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, num_filters, - input_vectorization, param_stddev, bias_stddev); - } - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); -} - -// Inline methods to convert from tensor index i.e., (x,y,z) index -// to index in yzx or zyx vectorized tensors -inline int32 YzxVectorIndex(int32 x, int32 y, int32 z, - int32 input_x_dim, - int32 input_y_dim, - int32 input_z_dim) { - KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); - return (input_y_dim * input_z_dim) * x + (input_y_dim) * z + y; -} - -inline int32 ZyxVectorIndex(int32 x, int32 y, int32 z, - int32 input_x_dim, - int32 input_y_dim, - int32 input_z_dim) { - KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); - return (input_y_dim * input_z_dim) * x + (input_z_dim) * y + z; -} - -// Method to convert from a matrix representing a minibatch of vectorized -// 3D tensors to patches for convolution, each patch corresponds to -// one dot product in the convolution -void ConvolutionComponent::InputToInputPatches( - const CuMatrixBase& in, - CuMatrix *patches) const{ - int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); - int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); - const int32 filt_x_step = filt_x_step_, - filt_y_step = filt_y_step_, - filt_x_dim = filt_x_dim_, - filt_y_dim = filt_y_dim_, - input_x_dim = input_x_dim_, - input_y_dim = input_y_dim_, - input_z_dim = input_z_dim_, - filter_dim = filter_params_.NumCols(); - - std::vector column_map(patches->NumCols()); - int32 column_map_size = column_map.size(); - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - int32 patch_start_index = patch_number * filter_dim; - for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { - for (int32 y = 0; y < 
filt_y_dim; y++) { - for (int32 z = 0; z < input_z_dim; z++, index++) { - KALDI_ASSERT(index < column_map_size); - if (input_vectorization_ == kZyx) { - column_map[index] = ZyxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } else if (input_vectorization_ == kYzx) { - column_map[index] = YzxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } - } - } - } - } - } - CuArray cu_cols(column_map); - patches->CopyCols(in, cu_cols); -} - - -// propagation function -// see function declaration in nnet-simple-component.h for details -void* ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - num_filters = filter_params_.NumRows(), - num_frames = in.NumRows(), - filter_dim = filter_params_.NumCols(); - KALDI_ASSERT((*out).NumRows() == num_frames && - (*out).NumCols() == (num_filters * num_x_steps * num_y_steps)); - - CuMatrix patches(num_frames, - num_x_steps * num_y_steps * filter_dim, - kUndefined); - InputToInputPatches(in, &patches); - CuSubMatrix* filter_params_elem = new CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); - std::vector* > tgt_batch, patch_batch, - filter_params_batch; - - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - tgt_batch.push_back(new CuSubMatrix( - out->ColRange(patch_number * num_filters, num_filters))); - patch_batch.push_back(new CuSubMatrix( - patches.ColRange(patch_number * filter_dim, filter_dim))); - filter_params_batch.push_back(filter_params_elem); - tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias - } - } - // apply all filters - AddMatMatBatched(1.0, tgt_batch, patch_batch, - kNoTrans, filter_params_batch, - kTrans, 1.0); - // release memory - delete filter_params_elem; - for (int32 p = 0; p < tgt_batch.size(); p++) { - delete tgt_batch[p]; - delete patch_batch[p]; - } - return NULL; -} - -// scale the parameters -void ConvolutionComponent::Scale(BaseFloat scale) { - if (scale == 0.0) { - filter_params_.SetZero(); - bias_params_.SetZero(); - } else { - filter_params_.Scale(scale); - bias_params_.Scale(scale); - } -} - -// add another convolution component -void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) { - const ConvolutionComponent *other = - dynamic_cast(&other_in); - KALDI_ASSERT(other != NULL); - filter_params_.AddMat(alpha, other->filter_params_); - bias_params_.AddVec(alpha, other->bias_params_); -} - -/* - This function transforms a vector of lists into a list of vectors, - padded with -1. - @param[in] The input vector of lists. Let in.size() be D, and let - the longest list length (i.e. the max of in[i].size()) be L. - @param[out] The output list of vectors. The length of the list will - be L, each vector-dimension will be D (i.e. out[i].size() == D), - and if in[i] == j, then for some k we will have that - out[k][j] = i. The output vectors are padded with -1 - where necessary if not all the input lists have the same side. 
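As a concrete illustration of the transform described above (example values invented): with D = 3 input lists,

    in  = { {5}, {7, 8}, {} }           // longest list has length L = 2
    out = { {5, 7, -1}, {-1, 8, -1} }   // L vectors, each of dimension D

i.e. out[k][i] == in[i][k] wherever in[i] has a k'th element, and -1 elsewhere.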
-*/ -void RearrangeIndexes(const std::vector > &in, - std::vector > *out) { - int32 D = in.size(); - int32 L = 0; - for (int32 i = 0; i < D; i++) - if (in[i].size() > L) - L = in[i].size(); - out->resize(L); - for (int32 i = 0; i < L; i++) - (*out)[i].resize(D, -1); - for (int32 i = 0; i < D; i++) { - for (int32 j = 0; j < in[i].size(); j++) { - (*out)[j][i] = in[i][j]; - } - } -} - -// Method to compute the input derivative matrix from the input derivatives -// for patches, where each patch corresponds to one dot product -// in the convolution -void ConvolutionComponent::InderivPatchesToInderiv( - const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const { - - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - filt_x_step = filt_x_step_, - filt_y_step = filt_y_step_, - filt_x_dim = filt_x_dim_, - filt_y_dim = filt_y_dim_, - input_x_dim = input_x_dim_, - input_y_dim = input_y_dim_, - input_z_dim = input_z_dim_, - filter_dim = filter_params_.NumCols(); - - // Compute the reverse column_map from the matrix with input - // derivative patches to input derivative matrix - std::vector > reverse_column_map(in_deriv->NumCols()); - int32 rev_col_map_size = reverse_column_map.size(); - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - int32 patch_start_index = patch_number * filter_dim; - for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { - for (int32 y = 0; y < filt_y_dim; y++) { - for (int32 z = 0; z < input_z_dim; z++, index++) { - int32 vector_index; - if (input_vectorization_ == kZyx) { - vector_index = ZyxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } else { - KALDI_ASSERT(input_vectorization_ == kYzx); - vector_index = YzxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } - KALDI_ASSERT(vector_index < rev_col_map_size); - reverse_column_map[vector_index].push_back(index); - } - } - } - } - } - std::vector > rearranged_column_map; - RearrangeIndexes(reverse_column_map, &rearranged_column_map); - for (int32 p = 0; p < rearranged_column_map.size(); p++) { - CuArray cu_cols(rearranged_column_map[p]); - in_deriv->AddCols(in_deriv_patches, cu_cols); - } -} - -// back propagation function -// see function declaration in nnet-simple-component.h for details -void ConvolutionComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const { - ConvolutionComponent *to_update = - dynamic_cast(to_update_in); - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - num_filters = filter_params_.NumRows(), - num_frames = out_deriv.NumRows(), - filter_dim = filter_params_.NumCols(); - - KALDI_ASSERT(out_deriv.NumRows() == num_frames && - out_deriv.NumCols() == - (num_filters * num_x_steps * num_y_steps)); - - // Compute inderiv patches - CuMatrix in_deriv_patches(num_frames, - num_x_steps * num_y_steps * filter_dim, - kSetZero); - - std::vector* > patch_deriv_batch, out_deriv_batch, - filter_params_batch; - CuSubMatrix* filter_params_elem = new 
CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); - - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - - patch_deriv_batch.push_back(new CuSubMatrix( - in_deriv_patches.ColRange( - patch_number * filter_dim, filter_dim))); - out_deriv_batch.push_back(new CuSubMatrix(out_deriv.ColRange( - patch_number * num_filters, num_filters))); - filter_params_batch.push_back(filter_params_elem); - } - } - AddMatMatBatched(1.0, patch_deriv_batch, - out_deriv_batch, kNoTrans, - filter_params_batch, kNoTrans, 0.0); - - if (in_deriv) { - // combine the derivatives from the individual input deriv patches - // to compute input deriv matrix - InderivPatchesToInderiv(in_deriv_patches, in_deriv); - } - - if (to_update != NULL) { - to_update->Update(debug_info, in_value, out_deriv, out_deriv_batch); - } - - // release memory - delete filter_params_elem; - for (int32 p = 0; p < patch_deriv_batch.size(); p++) { - delete patch_deriv_batch[p]; - delete out_deriv_batch[p]; - } -} - - -// update parameters -// see function declaration in nnet-simple-component.h for details -void ConvolutionComponent::Update(const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv, - const std::vector *>& out_deriv_batch) { - // useful dims - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - num_filters = filter_params_.NumRows(), - num_frames = out_deriv.NumRows(), - filter_dim = filter_params_.NumCols(); - KALDI_ASSERT(out_deriv.NumRows() == num_frames && - out_deriv.NumCols() == - (num_filters * num_x_steps * num_y_steps)); - - - CuMatrix filters_grad; - CuVector bias_grad; - - CuMatrix input_patches(num_frames, - filter_dim * num_x_steps * num_y_steps, - kUndefined); - InputToInputPatches(in_value, &input_patches); - - filters_grad.Resize(num_filters, filter_dim, kSetZero); // reset - bias_grad.Resize(num_filters, kSetZero); // reset - - // create a single large matrix holding the smaller matrices - // from the vector container filters_grad_batch along the rows - CuMatrix filters_grad_blocks_batch( - num_x_steps * num_y_steps * filters_grad.NumRows(), - filters_grad.NumCols()); - - std::vector* > filters_grad_batch, input_patch_batch; - - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - filters_grad_batch.push_back(new CuSubMatrix( - filters_grad_blocks_batch.RowRange( - patch_number * filters_grad.NumRows(), filters_grad.NumRows()))); - - input_patch_batch.push_back(new CuSubMatrix( - input_patches.ColRange(patch_number * filter_dim, filter_dim))); - } - } - - AddMatMatBatched(1.0, filters_grad_batch, out_deriv_batch, kTrans, - input_patch_batch, kNoTrans, 1.0); - - // add the row blocks together to filters_grad - filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch); - - // create a matrix holding the col blocks sum of out_deriv - CuMatrix out_deriv_col_blocks_sum(out_deriv.NumRows(), - num_filters); - - // add the col blocks together to out_deriv_col_blocks_sum - out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv); - - bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0); - - // release memory - for (int32 p = 0; p < input_patch_batch.size(); p++) { - delete filters_grad_batch[p]; - delete 
input_patch_batch[p]; - } - - // - // update - // - filter_params_.AddMat(learning_rate_, filters_grad); - bias_params_.AddVec(learning_rate_, bias_grad); -} - -void ConvolutionComponent::Read(std::istream &is, bool binary) { - ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_x_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_y_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_z_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_x_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_y_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_y_step_); - ExpectToken(is, binary, ""); - int32 input_vectorization; - ReadBasicType(is, binary, &input_vectorization); - input_vectorization_ = static_cast(input_vectorization); - ExpectToken(is, binary, ""); - filter_params_.Read(is, binary); - ExpectToken(is, binary, ""); - bias_params_.Read(is, binary); - std::string tok; - ReadToken(is, binary, &tok); - if (tok == "") { - ReadBasicType(is, binary, &is_gradient_); - ExpectToken(is, binary, ""); - } else { - is_gradient_ = false; - KALDI_ASSERT(tok == ""); - } -} - -void ConvolutionComponent::Write(std::ostream &os, bool binary) const { - WriteUpdatableCommon(os, binary); // write opening tag and learning rate. - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_x_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_y_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_z_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_x_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_y_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_y_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(input_vectorization_)); - WriteToken(os, binary, ""); - filter_params_.Write(os, binary); - WriteToken(os, binary, ""); - bias_params_.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, is_gradient_); - WriteToken(os, binary, ""); -} - -BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const { - const ConvolutionComponent *other = - dynamic_cast(&other_in); - return TraceMatMat(filter_params_, other->filter_params_, kTrans) - + VecVec(bias_params_, other->bias_params_); -} - -Component* ConvolutionComponent::Copy() const { - ConvolutionComponent *ans = new ConvolutionComponent(*this); - return ans; -} - -void ConvolutionComponent::PerturbParams(BaseFloat stddev) { - CuMatrix temp_filter_params(filter_params_); - temp_filter_params.SetRandn(); - filter_params_.AddMat(stddev, temp_filter_params); - - CuVector temp_bias_params(bias_params_); - temp_bias_params.SetRandn(); - bias_params_.AddVec(stddev, temp_bias_params); -} - -void ConvolutionComponent::SetParams(const VectorBase &bias, - const MatrixBase &filter) { - bias_params_ = bias; - filter_params_ = filter; - KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows()); -} - -int32 ConvolutionComponent::NumParameters() const { - return (filter_params_.NumCols() + 1) * filter_params_.NumRows(); -} - -void ConvolutionComponent::Vectorize(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == this->NumParameters()); - int32 num_filter_params = 
filter_params_.NumCols() * filter_params_.NumRows(); - params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_); - params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_); -} -void ConvolutionComponent::UnVectorize(const VectorBase ¶ms) { - KALDI_ASSERT(params.Dim() == this->NumParameters()); - int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows(); - filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params)); - bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim())); -} - -// aquire input dim -int32 MaxpoolingComponent::InputDim() const { - return input_x_dim_ * input_y_dim_ * input_z_dim_; -} - -MaxpoolingComponent::MaxpoolingComponent( - const MaxpoolingComponent &component): - input_x_dim_(component.input_x_dim_), - input_y_dim_(component.input_y_dim_), - input_z_dim_(component.input_z_dim_), - pool_x_size_(component.pool_x_size_), - pool_y_size_(component.pool_y_size_), - pool_z_size_(component.pool_z_size_), - pool_x_step_(component.pool_x_step_), - pool_y_step_(component.pool_y_step_), - pool_z_step_(component.pool_z_step_) { } - -// aquire output dim -int32 MaxpoolingComponent::OutputDim() const { - int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; - int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; - int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; - return num_pools_x * num_pools_y * num_pools_z; -} - -// check the component parameters -void MaxpoolingComponent::Check() const { - // sanity check of the max pooling parameters - KALDI_ASSERT(input_x_dim_ > 0); - KALDI_ASSERT(input_y_dim_ > 0); - KALDI_ASSERT(input_z_dim_ > 0); - KALDI_ASSERT(pool_x_size_ > 0); - KALDI_ASSERT(pool_y_size_ > 0); - KALDI_ASSERT(pool_z_size_ > 0); - KALDI_ASSERT(pool_x_step_ > 0); - KALDI_ASSERT(pool_y_step_ > 0); - KALDI_ASSERT(pool_z_step_ > 0); - KALDI_ASSERT(input_x_dim_ >= pool_x_size_); - KALDI_ASSERT(input_y_dim_ >= pool_y_size_); - KALDI_ASSERT(input_z_dim_ >= pool_z_size_); - KALDI_ASSERT(pool_x_size_ >= pool_x_step_); - KALDI_ASSERT(pool_y_size_ >= pool_y_step_); - KALDI_ASSERT(pool_z_size_ >= pool_z_step_); - KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0); - KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0); - KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0); -} - -// initialize the component using configuration file -void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { - bool ok = true; - - ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_); - ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_); - ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_); - ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_); - ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_); - ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_); - ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_); - ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_); - ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_); - - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - - Check(); -} - -// Method to convert from a matrix representing a minibatch of vectorized -// 3D tensors to patches for 3d max pooling, each patch corresponds to -// the nodes having the same local coordinatenodes from each pool -void MaxpoolingComponent::InputToInputPatches( - const CuMatrixBase& in, - 
CuMatrix *patches) const{ - int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; - int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; - int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; - - std::vector column_map(patches->NumCols()); - int32 column_map_size = column_map.size(); - for (int32 x = 0, index =0; x < pool_x_size_; x++) { - for (int32 y = 0; y < pool_y_size_; y++) { - for (int32 z = 0; z < pool_z_size_; z++) { - // given the local node coordinate, group them from each pool - // to form a patch - for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { - for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { - for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { - KALDI_ASSERT(index < column_map_size); - column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + - (y_pool * pool_y_step_ + y) * input_z_dim_ + - (z_pool * pool_z_step_ + z); - - } - } - } - } - } - } - CuArray cu_cols(column_map); - patches->CopyCols(in, cu_cols); -} - -/* - This is the 3d max pooling propagate function. - It is assumed that each row of the input matrix - is a vectorized 3D-tensor of type zxy. - Similar to the propagate function of ConvolutionComponent, - the input matrix is first arranged into patches so that - pools (with / without overlapping) could be - processed in a parallelizable manner. - The output matrix is also a vectorized 3D-tensor of type zxy. -*/ - -void* MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - int32 num_frames = in.NumRows(); - int32 num_pools = OutputDim(); - int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; - CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); - InputToInputPatches(in, &patches); - - out->Set(-1e20); // reset a large negative value - for (int32 q = 0; q < pool_size; q++) - out->Max(patches.ColRange(q * num_pools, num_pools)); - return NULL; -} - -// Method to compute the input derivative matrix from the input derivatives -// for patches, where each patch corresponds to -// the nodes having the same local coordinatenodes from each pool -void MaxpoolingComponent::InderivPatchesToInderiv( - const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const { - - int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; - int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; - int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; - - std::vector > reverse_column_map(in_deriv->NumCols()); - int32 rev_col_map_size = reverse_column_map.size(); - for (int32 x = 0, index = 0; x < pool_x_size_; x++) { - for (int32 y = 0; y < pool_y_size_; y++) { - for (int32 z = 0; z < pool_z_size_; z++) { - - for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { - for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { - for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { - int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + - (y_pool * pool_y_step_ + y) * input_z_dim_ + - (z_pool * pool_z_step_ + z); - - KALDI_ASSERT(vector_index < rev_col_map_size); - reverse_column_map[vector_index].push_back(index); - } - } - } - } - } - } - std::vector > rearranged_column_map; - RearrangeIndexes(reverse_column_map, &rearranged_column_map); - for (int32 p = 0; p < rearranged_column_map.size(); p++) { - CuArray cu_cols(rearranged_column_map[p]); - in_deriv->AddCols(in_deriv_patches, cu_cols); - } -} - -/* - 3d 
max pooling backpropagate function - This function backpropagate the error from - out_deriv to in_deriv. - In order to select the node in each pool to - backpropagate the error, it has to compare - the output pool value stored in the out_value - matrix with each of its input pool member node - stroed in the in_value matrix. -*/ -void MaxpoolingComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update, - CuMatrixBase *in_deriv) const { - if (!in_deriv) - return; - - int32 num_frames = in_value.NumRows(); - int32 num_pools = OutputDim(); - int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; - CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); - InputToInputPatches(in_value, &patches); - - for (int32 q = 0; q < pool_size; q++) { - // zero-out mask - CuMatrix mask; - out_value.EqualElementMask(patches.ColRange(q * num_pools, num_pools), &mask); - mask.MulElements(out_deriv); - patches.ColRange(q * num_pools, num_pools).CopyFromMat(mask); - } - - // combine the derivatives from the individual input deriv patches - // to compute input deriv matrix - InderivPatchesToInderiv(patches, in_deriv); -} - -void MaxpoolingComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &input_x_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_y_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_z_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_size_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_size_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_z_size_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_z_step_); - ExpectToken(is, binary, ""); - Check(); -} - -void MaxpoolingComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_x_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_y_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_z_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_size_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_size_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_z_size_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_z_step_); - WriteToken(os, binary, ""); -} - -// display information about component -std::string MaxpoolingComponent::Info() const { - std::ostringstream stream; - stream << Type() - << ", input-x-dim=" << input_x_dim_ - << ", input-y-dim=" << input_y_dim_ - << ", input-z-dim=" << input_z_dim_ - << ", pool-x-size=" << pool_x_size_ - << ", pool-y-size=" << pool_y_size_ - << ", pool-z-size=" << pool_z_size_ - << ", pool-x-step=" << pool_x_step_ - << ", pool-y-step=" << pool_y_step_ - << ", pool-z-step=" << pool_z_step_; - return stream.str(); -} - void PermuteComponent::ComputeReverseColumnMap() { int32 dim = column_map_.Dim(); KALDI_ASSERT(dim > 0); @@ -5550,371 +4617,6 @@ 
void CompositeComponent::SetComponent(int32 i, Component *component) { components_[i] = component; } -int32 LstmNonlinearityComponent::InputDim() const { - int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5 + (use_dropout_ ? 3 : 0); -} - -int32 LstmNonlinearityComponent::OutputDim() const { - int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 2; -} - - -void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { - ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. - ExpectToken(is, binary, ""); - params_.Read(is, binary); - ExpectToken(is, binary, ""); - value_sum_.Read(is, binary); - ExpectToken(is, binary, ""); - deriv_sum_.Read(is, binary); - ExpectToken(is, binary, ""); - self_repair_config_.Read(is, binary); - ExpectToken(is, binary, ""); - self_repair_total_.Read(is, binary); - - std::string tok; - ReadToken(is, binary, &tok); - if (tok == "") { - ReadBasicType(is, binary, &use_dropout_); - ReadToken(is, binary, &tok); - } else { - use_dropout_ = false; - } - KALDI_ASSERT(tok == ""); - ReadBasicType(is, binary, &count_); - - // For the on-disk format, we normalze value_sum_, deriv_sum_ and - // self_repair_total_ by dividing by the count, but in memory they are scaled - // by the count. [for self_repair_total_, the scaling factor is count_ * - // cell_dim]. - value_sum_.Scale(count_); - deriv_sum_.Scale(count_); - int32 cell_dim = params_.NumCols(); - self_repair_total_.Scale(count_ * cell_dim); - - InitNaturalGradient(); - - ExpectToken(is, binary, ""); - -} - -void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { - WriteUpdatableCommon(os, binary); // Read opening tag and learning rate. - - WriteToken(os, binary, ""); - params_.Write(os, binary); - WriteToken(os, binary, ""); - { - Matrix value_avg(value_sum_); - if (count_ != 0.0) - value_avg.Scale(1.0 / count_); - value_avg.Write(os, binary); - } - WriteToken(os, binary, ""); - { - Matrix deriv_avg(deriv_sum_); - if (count_ != 0.0) - deriv_avg.Scale(1.0 / count_); - deriv_avg.Write(os, binary); - } - WriteToken(os, binary, ""); - self_repair_config_.Write(os, binary); - WriteToken(os, binary, ""); - { - int32 cell_dim = params_.NumCols(); - Vector self_repair_prob(self_repair_total_); - if (count_ != 0.0) - self_repair_prob.Scale(1.0 / (count_ * cell_dim)); - self_repair_prob.Write(os, binary); - } - if (use_dropout_) { - // only write this if true; we have back-compat code in reading anyway. - // this makes the models without dropout easier to read with older code. - WriteToken(os, binary, ""); - WriteBasicType(os, binary, use_dropout_); - } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, count_); - WriteToken(os, binary, ""); -} - - - -std::string LstmNonlinearityComponent::Info() const { - std::ostringstream stream; - int32 cell_dim = params_.NumCols(); - stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim - << ", use-dropout=" << (use_dropout_ ? "true" : "false"); - PrintParameterStats(stream, "w_ic", params_.Row(0)); - PrintParameterStats(stream, "w_fc", params_.Row(1)); - PrintParameterStats(stream, "w_oc", params_.Row(2)); - - // Note: some of the following code mirrors the code in - // UpdatableComponent::Info(), in nnet-component-itf.cc. 
- if (count_ > 0) { - stream << ", count=" << std::setprecision(3) << count_ - << std::setprecision(6); - } - static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh", - "o_t_sigmoid", "m_t_tanh" }; - for (int32 i = 0; i < 5; i++) { - stream << ", " << nonlin_names[i] << "={"; - stream << " self-repair-lower-threshold=" << self_repair_config_(i) - << ", self-repair-scale=" << self_repair_config_(i + 5); - - if (count_ != 0) { - BaseFloat self_repaired_proportion = - self_repair_total_(i) / (count_ * cell_dim); - stream << ", self-repaired-proportion=" << self_repaired_proportion; - Vector value_sum(value_sum_.Row(i)), - deriv_sum(deriv_sum_.Row(i)); - Vector value_avg(value_sum), deriv_avg(deriv_sum); - value_avg.Scale(1.0 / count_); - deriv_avg.Scale(1.0 / count_); - stream << ", value-avg=" << SummarizeVector(value_avg) - << ", deriv-avg=" << SummarizeVector(deriv_avg); - } - stream << " }"; - } - return stream.str(); -} - - -Component* LstmNonlinearityComponent::Copy() const { - return new LstmNonlinearityComponent(*this); -} - -void LstmNonlinearityComponent::ZeroStats() { - value_sum_.SetZero(); - deriv_sum_.SetZero(); - self_repair_total_.SetZero(); - count_ = 0.0; -} - -void LstmNonlinearityComponent::Scale(BaseFloat scale) { - if (scale == 0.0) { - params_.SetZero(); - value_sum_.SetZero(); - deriv_sum_.SetZero(); - self_repair_total_.SetZero(); - count_ = 0.0; - } else { - params_.Scale(scale); - value_sum_.Scale(scale); - deriv_sum_.Scale(scale); - self_repair_total_.Scale(scale); - count_ *= scale; - } -} - -void LstmNonlinearityComponent::Add(BaseFloat alpha, - const Component &other_in) { - const LstmNonlinearityComponent *other = - dynamic_cast(&other_in); - KALDI_ASSERT(other != NULL); - params_.AddMat(alpha, other->params_); - value_sum_.AddMat(alpha, other->value_sum_); - deriv_sum_.AddMat(alpha, other->deriv_sum_); - self_repair_total_.AddVec(alpha, other->self_repair_total_); - count_ += alpha * other->count_; -} - -void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { - CuMatrix temp_params(params_.NumRows(), params_.NumCols()); - temp_params.SetRandn(); - params_.AddMat(stddev, temp_params); -} - -BaseFloat LstmNonlinearityComponent::DotProduct( - const UpdatableComponent &other_in) const { - const LstmNonlinearityComponent *other = - dynamic_cast(&other_in); - KALDI_ASSERT(other != NULL); - return TraceMatMat(params_, other->params_, kTrans); -} - -int32 LstmNonlinearityComponent::NumParameters() const { - return params_.NumRows() * params_.NumCols(); -} - -void LstmNonlinearityComponent::Vectorize(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == NumParameters()); - params->CopyRowsFromMat(params_); -} - - -void LstmNonlinearityComponent::UnVectorize( - const VectorBase ¶ms) { - KALDI_ASSERT(params.Dim() == NumParameters()); - params_.CopyRowsFromVec(params); -} - - -void* LstmNonlinearityComponent::Propagate( - const ComponentPrecomputedIndexes *, // indexes - const CuMatrixBase &in, - CuMatrixBase *out) const { - cu::ComputeLstmNonlinearity(in, params_, out); - return NULL; -} - - -void LstmNonlinearityComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const { - - if (to_update_in == NULL) { - cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, - deriv_sum_, self_repair_config_, - count_, in_deriv, - 
(CuMatrixBase*) NULL, - (CuMatrixBase*) NULL, - (CuMatrixBase*) NULL, - (CuMatrixBase*) NULL); - } else { - LstmNonlinearityComponent *to_update = - dynamic_cast(to_update_in); - KALDI_ASSERT(to_update != NULL); - - int32 cell_dim = params_.NumCols(); - CuMatrix params_deriv(3, cell_dim, kUndefined); - CuMatrix self_repair_total(5, cell_dim, kUndefined); - - cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, - deriv_sum_, self_repair_config_, - count_, in_deriv, ¶ms_deriv, - &(to_update->value_sum_), - &(to_update->deriv_sum_), - &self_repair_total); - - CuVector self_repair_total_sum(5); - self_repair_total_sum.AddColSumMat(1.0, self_repair_total, 0.0); - to_update->self_repair_total_.AddVec(1.0, self_repair_total_sum); - to_update->count_ += static_cast(in_value.NumRows()); - - BaseFloat scale = 1.0; - if (!to_update->is_gradient_) { - to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, &scale); - } - to_update->params_.AddMat(to_update->learning_rate_ * scale, - params_deriv); - } -} - -LstmNonlinearityComponent::LstmNonlinearityComponent( - const LstmNonlinearityComponent &other): - UpdatableComponent(other), - params_(other.params_), - use_dropout_(other.use_dropout_), - value_sum_(other.value_sum_), - deriv_sum_(other.deriv_sum_), - self_repair_config_(other.self_repair_config_), - self_repair_total_(other.self_repair_total_), - count_(other.count_), - preconditioner_(other.preconditioner_) { } - -void LstmNonlinearityComponent::Init( - int32 cell_dim, bool use_dropout, - BaseFloat param_stddev, - BaseFloat tanh_self_repair_threshold, - BaseFloat sigmoid_self_repair_threshold, - BaseFloat self_repair_scale) { - KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 && - tanh_self_repair_threshold >= 0.0 && - tanh_self_repair_threshold <= 1.0 && - sigmoid_self_repair_threshold >= 0.0 && - sigmoid_self_repair_threshold <= 0.25 && - self_repair_scale >= 0.0 && self_repair_scale <= 0.1); - use_dropout_ = use_dropout; - params_.Resize(3, cell_dim); - params_.SetRandn(); - params_.Scale(param_stddev); - value_sum_.Resize(5, cell_dim); - deriv_sum_.Resize(5, cell_dim); - self_repair_config_.Resize(10); - self_repair_config_.Range(0, 5).Set(sigmoid_self_repair_threshold); - self_repair_config_(2) = tanh_self_repair_threshold; - self_repair_config_(4) = tanh_self_repair_threshold; - self_repair_config_.Range(5, 5).Set(self_repair_scale); - self_repair_total_.Resize(5); - count_ = 0.0; - InitNaturalGradient(); - -} - -void LstmNonlinearityComponent::InitNaturalGradient() { - // As regards the configuration for the natural-gradient preconditioner, we - // don't make it configurable from the command line-- it's unlikely that any - // differences from changing this would be substantial enough to effectively - // tune the configuration. Because the preconditioning code doesn't 'see' the - // derivatives from individual frames, but only averages over the minibatch, - // there is a fairly small amount of data available to estimate the Fisher - // information matrix, so we set the rank, update period and - // num-samples-history to smaller values than normal. 
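When the destination component is actually being trained (rather than just accumulating a gradient), the update in Backprop() above preconditions the derivative of the three diagonal parameter rows with the online natural-gradient object, which also returns a scaling factor, and then adds the result into params_ with the learning rate. A plain sketch of that final step, with the preconditioning abstracted into an ng_scale factor (illustrative only):

#include <vector>

// Final step of the update in Backprop():
//   params += learning_rate * ng_scale * params_deriv
// ng_scale is 1.0 for plain SGD; otherwise it comes from the natural-gradient
// preconditioner, which has already rescaled params_deriv in place.
void ApplyUpdate(std::vector<float> *params,
                 const std::vector<float> &params_deriv,
                 float learning_rate, float ng_scale) {
  for (size_t i = 0; i < params->size(); i++)
    (*params)[i] += learning_rate * ng_scale * params_deriv[i];
}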
- preconditioner_.SetRank(20); - preconditioner_.SetUpdatePeriod(2); - preconditioner_.SetNumSamplesHistory(1000.0); -} - -/// virtual -void LstmNonlinearityComponent::FreezeNaturalGradient(bool freeze) { - preconditioner_.Freeze(freeze); -} - -void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { - InitLearningRatesFromConfig(cfl); - bool ok = true; - bool use_dropout = false; - int32 cell_dim; - // these self-repair thresholds are the normal defaults for tanh and sigmoid - // respectively. If, later on, we decide that we want to support different - // self-repair config values for the individual sigmoid and tanh - // nonlinearities, we can modify this code then. - BaseFloat tanh_self_repair_threshold = 0.2, - sigmoid_self_repair_threshold = 0.05, - self_repair_scale = 1.0e-05; - // param_stddev is the stddev of the parameters. it may be better to - // use a smaller value but this was the default in the python scripts - // for a while. - BaseFloat param_stddev = 1.0; - ok = ok && cfl->GetValue("cell-dim", &cell_dim); - cfl->GetValue("param-stddev", ¶m_stddev); - cfl->GetValue("tanh-self-repair-threshold", - &tanh_self_repair_threshold); - cfl->GetValue("sigmoid-self-repair-threshold", - &sigmoid_self_repair_threshold); - cfl->GetValue("self-repair-scale", &self_repair_scale); - cfl->GetValue("use-dropout", &use_dropout); - - // We may later on want to make it possible to initialize the different - // parameters w_ic, w_fc and w_oc with different biases. We'll implement - // that when and if it's needed. - - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (ok) { - Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, - sigmoid_self_repair_threshold, self_repair_scale); - } else { - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; - } -} - -void LstmNonlinearityComponent::ConsolidateMemory() { - OnlineNaturalGradient preconditioner_temp(preconditioner_); - preconditioner_.Swap(&preconditioner_); -} SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 11c60f8f352..546176f71ee 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1,9 +1,9 @@ // nnet3/nnet-simple-component.h // Copyright 2011-2013 Karel Vesely -// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2012-2017 Johns Hopkins University (author: Daniel Povey) // 2013 Xiaohui Zhang -// 2014-2015 Vijayaditya Peddinti +// 2014-2016 Vijayaditya Peddinti // 2014-2015 Guoguo Chen // 2015 Daniel Galvez // 2015 Tom Ko @@ -42,7 +42,7 @@ namespace nnet3 { /// nnet-general-component.h there are components that don't fit this pattern. /// /// Some components that do provide the kSimpleComponent flag are not declared -/// here: see also nnet-normalize-component.h. +/// here: see also nnet-normalize-component.h and nnet-combined-component.h // This "nnet3" version of the p-norm component only supports the 2-norm. 
class PnormComponent: public Component { @@ -756,7 +756,7 @@ class LogSoftmaxComponent: public NonlinearComponent { Configuration values accepted by this component: Values inherited from UpdatableComponent (see its declaration in - nnet-component-itf for details): + nnet-component-itf.h for details): learning-rate learning-rate-factor max-change @@ -817,8 +817,8 @@ class LogSoftmaxComponent: public NonlinearComponent { matrix in the input space. default=20. rank-out Rank used in low-rank-plus-unit estimate of Fisher matrix in the output-derivative space. default=80. - update-period Determines after with what frequency (in - minibatches) we update the Fisher-matrix estimates; + update-period Determines the period (in minibatches) with which + we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. */ @@ -1815,7 +1815,6 @@ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { }; - /* ScaleAndOffsetComponent implements a per-element scale and offset. It may be useful just after BatchNormComponent, as the trainable offset @@ -1947,523 +1946,6 @@ class ScaleAndOffsetComponent: public UpdatableComponent { }; - -/** - * WARNING, this component is deprecated in favor of - * TimeHeightConvolutionComponent, and will be deleted. - * ConvolutionalComponent implements 2d-convolution. - * It uses 3D filters on 3D inputs, but the 3D filters hop only over - * 2 dimensions as it has same size as the input along the 3rd dimension. - * Input : A matrix where each row is a vectorized 3D-tensor. - * The 3D tensor has dimensions - * x: (e.g. time) - * y: (e.g. frequency) - * z: (e.g. channels like features/delta/delta-delta) - * - * The component supports input vectorizations of type zyx and yzx. - * The default vectorization type is zyx. - * e.g. for input vectorization of type zyx the input is vectorized by - * spanning axes z, y and x of the tensor in that order. - * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions - * the zyx vectorized input looks like - * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1) - * - * - * Output : The output is also a 3D tensor vectorized in the zyx format. - * The channel axis (z) in the output corresponds to the output of - * different filters. The first channel corresponds to the first filter - * i.e., first row of the filter_params_ matrix. - * - * Note: The component has to support yzx input vectorization as the binaries - * like add-deltas generate yz vectorized output. These input vectors are - * concatenated using the Append descriptor across time steps to form a yzx - * vectorized 3D tensor input. - * e.g. Append(Offset(input, -1), input, Offset(input, 1)) - * - * - * For information on the hyperparameters and parameters of this component see - * the variable declarations. - * - * Propagation: - * ------------ - * Convolution operation consists of a dot-products between the filter tensor - * and input tensor patch, for various shifts of filter tensor along the x and y - * axes input tensor. (Note: there is no shift along z-axis as the filter and - * input tensor have same size along this axis). - * - * For a particular shift (i,j) of the filter tensor - * along input tensor dimensions x and y, the elements of the input tensor which - * overlap with the filter form the input tensor patch. This patch is vectorized - * in zyx format. 
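The zyx (default) and yzx orderings described above fix how a coordinate (x, y, z) maps to a column of the input matrix; the column maps built inside the component follow the same arithmetic. A small sketch of the two index computations (illustrative helpers, not part of this patch):

// Flat column index of element (x, y, z) of a tensor with dimensions
// (x_dim, y_dim, z_dim).  "zyx" means z varies fastest, then y, then x;
// "yzx" means y varies fastest, then z, then x.
inline int ZyxIndex(int x, int y, int z, int y_dim, int z_dim) {
  return x * y_dim * z_dim + y * z_dim + z;
}
inline int YzxIndex(int x, int y, int z, int y_dim, int z_dim) {
  return x * y_dim * z_dim + z * y_dim + y;
}
// For the 2x2x2 tensor A above, ZyxIndex enumerates the elements in the order
// A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1).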
All the patches corresponding to various samples in the - * mini-batch are stacked into a matrix, where each row corresponds to one - * patch. Let this matrix be represented by X_{i,j}. The dot products with - * various filters are computed simultaneously by computing the matrix product - * with the filter_params_ matrix (W) - * Y_{i,j} = X_{i,j}*W^T. - * Each row of W corresponds to one filter 3D tensor vectorized in zyx format. - * - * All the matrix products corresponding to various shifts (i,j) of the - * filter tensor are computed simultaneously using the AddMatMatBatched - * call of CuMatrixBase class. - * - * BackPropagation: - * ---------------- - * Backpropagation to compute the input derivative (\nabla X_{i,j}) - * consists of the a series of matrix products. - * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the - * output derivative for a particular shift of the filter. - * - * Once again these matrix products are computed simultaneously. - * - * Update: - * ------- - * The weight gradient is computed as - * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j}) - * - */ -class ConvolutionComponent: public UpdatableComponent { - public: - enum TensorVectorizationType { - kYzx = 0, - kZyx = 1 - }; - - ConvolutionComponent(); - // constructor using another component - ConvolutionComponent(const ConvolutionComponent &component); - // constructor using parameters - ConvolutionComponent( - const CuMatrixBase &filter_params, - const CuVectorBase &bias_params, - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - BaseFloat learning_rate); - - virtual int32 InputDim() const; - virtual int32 OutputDim() const; - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "ConvolutionComponent"; } - virtual int32 Properties() const { - return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput| - kBackpropAdds|kPropagateAdds; - } - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const; - void Update(const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv, - const std::vector *>& out_deriv_batch); - - - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - - virtual Component* Copy() const; - - // Some functions from base-class UpdatableComponent. - virtual void Scale(BaseFloat scale); - virtual void Add(BaseFloat alpha, const Component &other); - virtual void PerturbParams(BaseFloat stddev); - virtual BaseFloat DotProduct(const UpdatableComponent &other) const; - virtual int32 NumParameters() const; - virtual void Vectorize(VectorBase *params) const; - virtual void UnVectorize(const VectorBase ¶ms); - - // Some functions that are specific to this class. 
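Written as dense products, the three operations described above (propagation, input-derivative backprop, and the weight-gradient update) are, for one filter shift (i, j): Y = X W^T, dX = dY W, and dW += X^T dY, where rows of X are vectorized input patches and rows of W are vectorized filters. A toy loop-based sketch of the same products (illustrative only; the component performs them as batched CuMatrix operations on the GPU):

#include <vector>

using Mat = std::vector<std::vector<float> >;  // row-major toy matrix

// Y = X * W^T :  (num_patches x patch_dim) times (num_filters x patch_dim)^T.
Mat Forward(const Mat &X, const Mat &W) {
  Mat Y(X.size(), std::vector<float>(W.size(), 0.0f));
  for (size_t p = 0; p < X.size(); p++)
    for (size_t f = 0; f < W.size(); f++)
      for (size_t d = 0; d < X[p].size(); d++)
        Y[p][f] += X[p][d] * W[f][d];
  return Y;
}

// dX += dY * W  and  dW += X^T * dY.  dX and dW are assumed pre-sized and
// zeroed, or accumulating over shifts as in the Update() equation above.
void Backward(const Mat &X, const Mat &W, const Mat &dY, Mat *dX, Mat *dW) {
  for (size_t p = 0; p < X.size(); p++)
    for (size_t f = 0; f < W.size(); f++)
      for (size_t d = 0; d < X[p].size(); d++) {
        (*dX)[p][d] += dY[p][f] * W[f][d];
        (*dW)[f][d] += X[p][d] * dY[p][f];
      }
}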
- void SetParams(const VectorBase &bias, - const MatrixBase &filter); - const CuVector &BiasParams() const { return bias_params_; } - const CuMatrix &LinearParams() const { return filter_params_; } - void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, int32 num_filters, - TensorVectorizationType input_vectorization, - BaseFloat param_stddev, BaseFloat bias_stddev); - // there is no filt_z_dim parameter as the length of the filter along - // z-dimension is same as the input - void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - std::string matrix_filename); - - // resize the component, setting the parameters to zero, while - // leaving any other configuration values the same - void Resize(int32 input_dim, int32 output_dim); - - void Update(const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv); - - - private: - int32 input_x_dim_; // size of the input along x-axis - // (e.g. number of time steps) - - int32 input_y_dim_; // size of input along y-axis - // (e.g. number of mel-frequency bins) - - int32 input_z_dim_; // size of input along z-axis - // (e.g. number of channels is 3 if the input has - // features + delta + delta-delta features - - int32 filt_x_dim_; // size of the filter along x-axis - - int32 filt_y_dim_; // size of the filter along y-axis - - // there is no filt_z_dim_ as it is always assumed to be - // the same as input_z_dim_ - - int32 filt_x_step_; // the number of steps taken along x-axis of input - // before computing the next dot-product - // of filter and input - - int32 filt_y_step_; // the number of steps taken along y-axis of input - // before computing the next dot-product of the filter - // and input - - // there is no filt_z_step_ as only dot product is possible along this axis - - TensorVectorizationType input_vectorization_; // type of vectorization of the - // input 3D tensor. Accepts zyx and yzx formats - - CuMatrix filter_params_; - // the filter (or kernel) matrix is a matrix of vectorized 3D filters - // where each row in the matrix corresponds to one filter. - // The 3D filter tensor is vectorizedin zyx format. - // The first row of the matrix corresponds to the first filter and so on. - // Keep in mind the vectorization type and order of filters when using file - // based initialization. - - CuVector bias_params_; - // the filter-specific bias vector (i.e., there is a seperate bias added - // to the output of each filter). - - void InputToInputPatches(const CuMatrixBase& in, - CuMatrix *patches) const; - void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const; - const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow. -}; - - -/* - LstmNonlinearityComponent is a component that implements part of an LSTM, by - combining together the sigmoids and tanh's, plus some diagonal terms, into - a single block. - We will refer to the LSTM formulation used in - - Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" - by H. Sak et al, - http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf. - - Suppose the cell dimension is C. 
Then outside this component, we compute - the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single - matrix multiplication: - - i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i - f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f - c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c - o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o - - The part of the computation that takes place in this component is as follows. - Its input is of dimension 5C [however, search for 'dropout' below], - consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). Its - output is of dimension 2C, consisting of 2 blocks: c_t and m_t. - - To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t). - - This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f - and w_o. - - - In the forward pass (Propagate), this component computes the following: - - i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1) - f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2) - c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3) - o_t = Sigmoid(o_part + w_{oc}*c_t) (4) - m_t = o_t * Tanh(c_t) (5) - # note: the outputs are just c_t and m_t. - - [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead - of 5C in this case, the last three input dimensions will be interpreted as - per-frame dropout masks on i_t, f_t and o_t respectively, so that on the RHS of - (3), i_t is replaced by i_t * i_t_scale, and likewise for f_t and o_t.] - - The backprop is as you would think, but for the "self-repair" we need to pass - in additional vectors (of the same dim as the parameters of the layer) that - dictate whether or not we add an additional term to the backpropagated - derivatives. (This term helps force the input to the nonlinearities into the - range where the derivatives are not too small). - - This component stores stats of the same form as are normally stored by the - StoreStats() functions for the sigmoid and tanh units, i.e. averages of the - activations and derivatives, but this is done inside the Backprop() functions. - [the StoreStats() functions don't take the input data as an argument, so - storing this data that way is impossible, and anyway it's more efficient to - do it as part of backprop.] - - Configuration values accepted: - cell-dim e.g. cell-dim=1024 Cell dimension. The input - dimension of this component is cell-dim * 5, and the - output dimension is cell-dim * 2. Note: this - component implements only part of the LSTM layer, - see comments above. - param-stddev Standard deviation for random initialization of - the diagonal matrices (AKA peephole connections). - default=1.0, which is probably too high but - we couldn't see any reliable gain from decreasing it. - tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold - in a TanhComponent; applies to both the tanh nonlinearities. - default=0.2, you probably won't want to changethis. - sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold - in a SigmoidComponent; applies to all three of the sigmoid - nonlinearities. default=0.05, you probably won't want to - change this. - self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent - or TanhComponent; applies to both the sigmoid and tanh - nonlinearities. default=1.0e-05, which you probably won't - want to change unless dealing with an objective function - that has smaller or larger dynamic range than normal, in - which case you might want to make it smaller or larger. 
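Per cell dimension d, equations (1) through (5) above amount to the following scalar computation, with w_ic, w_fc and w_oc taken from column d of the three parameter rows. A self-contained sketch of the forward pass for one dimension, ignoring the optional dropout masks and self-repair terms (the component itself does the whole-minibatch version on the GPU):

#include <cmath>

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// One dimension of the forward pass, following equations (1)-(5):
// inputs are one element each of (i_part, f_part, c_part, o_part, c_{t-1});
// outputs are the matching elements of (c_t, m_t).
void LstmNonlinForward(float i_part, float f_part, float c_part, float o_part,
                       float c_prev, float w_ic, float w_fc, float w_oc,
                       float *c_t, float *m_t) {
  float i_t = Sigmoid(i_part + w_ic * c_prev);     // (1)
  float f_t = Sigmoid(f_part + w_fc * c_prev);     // (2)
  *c_t = f_t * c_prev + i_t * std::tanh(c_part);   // (3)
  float o_t = Sigmoid(o_part + w_oc * (*c_t));     // (4)
  *m_t = o_t * std::tanh(*c_t);                    // (5)
}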
-*/ -class LstmNonlinearityComponent: public UpdatableComponent { - public: - - virtual int32 InputDim() const; - virtual int32 OutputDim() const; - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - LstmNonlinearityComponent(): use_dropout_(false) { } - virtual std::string Type() const { return "LstmNonlinearityComponent"; } - virtual int32 Properties() const { - return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; - } - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - - virtual Component* Copy() const; - - // Some functions from base-class UpdatableComponent. - virtual void Scale(BaseFloat scale); - virtual void Add(BaseFloat alpha, const Component &other); - virtual void PerturbParams(BaseFloat stddev); - virtual BaseFloat DotProduct(const UpdatableComponent &other) const; - virtual int32 NumParameters() const; - virtual void Vectorize(VectorBase *params) const; - virtual void UnVectorize(const VectorBase ¶ms); - virtual void ZeroStats(); - virtual void FreezeNaturalGradient(bool freeze); - - // Some functions that are specific to this class: - explicit LstmNonlinearityComponent( - const LstmNonlinearityComponent &other); - - void Init(int32 cell_dim, bool use_dropout, - BaseFloat param_stddev, - BaseFloat tanh_self_repair_threshold, - BaseFloat sigmoid_self_repair_threshold, - BaseFloat self_repair_scale); - - virtual void ConsolidateMemory(); - - private: - - // Initializes the natural-gradient object with the configuration we - // use for this object, which for now is hardcoded at the C++ level. - void InitNaturalGradient(); - - // Notation: C is the cell dimension; it equals params_.NumCols(). - - // The dimension of the parameter matrix is (3 x C); - // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. - CuMatrix params_; - - // If true, we expect an extra 3 dimensions on the input, for dropout masks - // for i_t and f_t. - bool use_dropout_; - - // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in - // equations (1) through (5), this is the sum of the values of the nonliearities - // (used for diagnostics only). It is comparable to value_sum_ vector - // in base-class NonlinearComponent. - CuMatrix value_sum_; - - // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in - // equations (1) through (5), this is the sum of the derivatives of the - // nonliearities (used for diagnostics and to control self-repair). It is - // comparable to the deriv_sum_ vector in base-class - // NonlinearComponent. - CuMatrix deriv_sum_; - - // This matrix has dimension 10. The contents are a block of 5 self-repair - // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5 - // self-repair scales (typically all 0.00001). These are for each of the 5 - // nonlinearities in the LSTM component in turn (see comments in cu-math.h for - // more info). - CuVector self_repair_config_; - - // This matrix has dimension 5. 
For each of the 5 nonlinearities in the LSTM - // component (see comments in cu-math.h for more info), it contains the total, - // over all frames represented in count_, of the number of dimensions that - // were subject to self_repair. To get the self-repair proportion you should - // divide by (count_ times cell_dim_). - CuVector self_repair_total_; - - // The total count (number of frames) corresponding to the stats in value_sum_ - // and deriv_sum_. - double count_; - - // Preconditioner for the parameters of this component [operates in the space - // of dimension C]. - // The preconditioner stores its own configuration values; we write and read - // these, but not the preconditioner object itself. - OnlineNaturalGradient preconditioner_; - - const LstmNonlinearityComponent &operator - = (const LstmNonlinearityComponent &other); // Disallow. -}; - - - - -/* - * WARNING, this component is deprecated as it's not compatible with - * TimeHeightConvolutionComponent, and it will eventually be deleted. - * MaxPoolingComponent : - * Maxpooling component was firstly used in ConvNet for selecting an - * representative activation in an area. It inspired Maxout nonlinearity. - * Each output element of this component is the maximum of a block of - * input elements where the block has a 3D dimension (pool_x_size_, - * pool_y_size_, pool_z_size_). - * Blocks could overlap if the shift value on any axis is smaller - * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_). - * If the shift values are euqal to their pool size, there is no - * overlap; while if they all equal 1, the blocks overlap to - * the greatest possible extent. - * - * This component is designed to be used after a ConvolutionComponent - * so that the input matrix is propagated from a 2d-convolutional layer. - * This component implements 3d-maxpooling which performs - * max pooling along the three axes. - * Input : A matrix where each row is a vectorized 3D-tensor. - * The 3D tensor has dimensions - * x: (e.g. time) - * y: (e.g. frequency) - * z: (e.g. channels like number of filters in the ConvolutionComponent) - * - * The component assumes input vectorizations of type zyx - * which is the default output vectorization type of a ConvolutionComponent. - * e.g. for input vectorization of type zyx the input is vectorized by - * spanning axes z, y and x of the tensor in that order. - * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions - * the zyx vectorized input looks like - * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1) - * - * Output : The output is also a 3D tensor vectorized in the zyx format. - * - * For information on the hyperparameters and parameters of this component see - * the variable declarations. 
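The pooling geometry above implies the following dimension arithmetic: along each axis there are 1 + (input_dim - pool_size) / pool_step windows, and the component's output dimension is the product of the three window counts (one output value per 3-D window). A small sketch, assuming the windows tile the input exactly, which the component's Check() is expected to enforce (illustrative helpers, not part of this patch):

#include <cassert>

// Number of pooling windows along one axis; windows overlap when
// pool_step < pool_size and tile the axis exactly when pool_step == pool_size.
inline int NumPools(int input_dim, int pool_size, int pool_step) {
  assert((input_dim - pool_size) % pool_step == 0);
  return 1 + (input_dim - pool_size) / pool_step;
}

// Output dimension of 3-D max pooling: one value per window.
inline int MaxpoolOutputDim(int x_dim, int y_dim, int z_dim,
                            int pool_x, int pool_y, int pool_z,
                            int step_x, int step_y, int step_z) {
  return NumPools(x_dim, pool_x, step_x) *
         NumPools(y_dim, pool_y, step_y) *
         NumPools(z_dim, pool_z, step_z);
}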
- * - * - */ -class MaxpoolingComponent: public Component { - public: - - MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), - pool_x_size_(0), pool_y_size_(0), pool_z_size_(0), - pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { } - // constructor using another component - MaxpoolingComponent(const MaxpoolingComponent &component); - - virtual int32 InputDim() const; - virtual int32 OutputDim() const; - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "MaxpoolingComponent"; } - virtual int32 Properties() const { - return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput| - kBackpropAdds; - } - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - virtual Component* Copy() const { return new MaxpoolingComponent(*this); } - - - protected: - void InputToInputPatches(const CuMatrixBase& in, - CuMatrix *patches) const; - void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const; - virtual void Check() const; - - - int32 input_x_dim_; // size of the input along x-axis - // (e.g. number of time steps) - int32 input_y_dim_; // size of input along y-axis - // (e.g. number of mel-frequency bins) - int32 input_z_dim_; // size of input along z-axis - // (e.g. number of filters in the ConvolutionComponent) - - int32 pool_x_size_; // size of the pooling window along x-axis - int32 pool_y_size_; // size of the pooling window along y-axis - int32 pool_z_size_; // size of the pooling window along z-axis - - int32 pool_x_step_; // the number of steps taken along x-axis of input - // before computing the next pool - int32 pool_y_step_; // the number of steps taken along y-axis of input - // before computing the next pool - int32 pool_z_step_; // the number of steps taken along z-axis of input - // before computing the next pool - -}; - - /** CompositeComponent is a component representing a sequence of [simple] components. The config line would be something like the following diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index bae332cd584..a8ef30bc314 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1400,7 +1400,7 @@ void ComputeExampleComputationRequestSimple( static void GenerateRandomComponentConfig(std::string *component_type, std::string *config) { - int32 n = RandInt(0, 35); + int32 n = RandInt(0, 37); BaseFloat learning_rate = 0.001 * RandInt(1, 100); std::ostringstream os; @@ -1757,6 +1757,22 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " use-bias=" << (RandInt(0,1) == 0 ? "true":"false"); break; } + case 36: { + *component_type = "GruNonlinearityComponent"; + int32 cell_dim = RandInt(10, 20); + int32 recurrent_dim = (RandInt(0, 1) == 0 ? 
+                             RandInt(5, cell_dim - 1) : cell_dim);
+      os << "cell-dim=" << cell_dim
+         << " recurrent-dim=" << recurrent_dim;
+      break;
+    }
+    case 37: {
+      *component_type = "OutputGruNonlinearityComponent";
+      os << "cell-dim=" << RandInt(10, 20)
+         << " learning-rate=" << learning_rate;
+
+      break;
+    }
     default:
       KALDI_ERR << "Error generating random component";
   }
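For reference, the two new cases produce config strings of the following shape, which the test code then uses to initialize a component of the named type via InitFromConfig(); the numbers are drawn randomly per test, so the values below are only illustrative:

  cell-dim=16 recurrent-dim=9        (case 36, GruNonlinearityComponent)
  cell-dim=14 learning-rate=0.072    (case 37, OutputGruNonlinearityComponent)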