From 1279778ca16acd9d62b42c4dab65d0d66c49e754 Mon Sep 17 00:00:00 2001 From: LvHang Date: Sat, 15 Sep 2018 20:42:15 -0400 Subject: [PATCH] Add GruNonlinearityComponent(by Dan) and OutputGruNonlinearityComponent; moving aroun some sources in nnet3 to avoid very large files rename nnet-combined-component.{h,cc} and str case Update get_saturation.pl for fast gru version. Get matched resutls --- .../local/chain/tuning/run_tdnn_opgru_1a.sh | 25 +- .../local/chain/tuning/run_tdnn_opgru_1b.sh | 315 +++ egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py | 1080 +++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 5 + egs/wsj/s5/steps/nnet3/get_saturation.pl | 8 + src/nnet3/Makefile | 2 +- src/nnet3/nnet-combined-component.cc | 2332 +++++++++++++++++ src/nnet3/nnet-combined-component.h | 1109 ++++++++ src/nnet3/nnet-component-itf.cc | 5 + src/nnet3/nnet-simple-component.cc | 1300 +-------- src/nnet3/nnet-simple-component.h | 530 +--- src/nnet3/nnet-test-utils.cc | 18 +- 12 files changed, 4889 insertions(+), 1840 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh create mode 100644 src/nnet3/nnet-combined-component.cc create mode 100644 src/nnet3/nnet-combined-component.h diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh index b1426bc22b7..18d3f81ffde 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -4,31 +4,36 @@ # This is based on TDNN_LSTM_1b, but using the NormOPGRU to replace the LSTMP, # and adding chunk-{left,right}-context-initial=0 +# For the details of OPGRU structure, please check the paper +# "Output-Gate Projected Gated Recurrent Unit for Speech Recognition" +# by Gaofeng Cheng et al, +# http://www.danielpovey.com/files/2018_interspeech_opgru.pdf + # Different from the vanilla OPGRU, Norm-OPGRU adds batchnorm in its output (forward direction) # and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar # results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). # ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_opgru_1a_sp # System tdnn_lstm_1e_sp tdnn_opgru_1a_sp -# WER on train_dev(tg) 12.81 12.39 -# [looped:] 12.93 12.32 -# WER on train_dev(fg) 11.92 11.39 -# [looped:] 12.07 11.35 +# WER on train_dev(tg) 12.81 12.31 +# [looped:] 12.93 12.26 +# WER on train_dev(fg) 11.92 11.60 +# [looped:] 12.07 11.65 # WER on eval2000(tg) 15.6 15.1 # [looped:] 16.0 15.1 -# WER on eval2000(fg) 14.1 13.6 +# WER on eval2000(fg) 14.1 13.5 # [looped:] 14.5 13.5 -# Final train prob -0.065 -0.066 -# Final valid prob -0.087 -0.085 -# Final train prob (xent) -0.918 -0.889 -# Final valid prob (xent) -1.0309 -0.9837 +# Final train prob -0.065 -0.068 +# Final valid prob -0.087 -0.091 +# Final train prob (xent) -0.918 -0.879 +# Final valid prob (xent) -1.0309 -0.9667 set -e # configs for 'chain' -stage=12 +stage=0 train_stage=-10 get_egs_stage=-10 speed_perturb=true diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh new file mode 100755 index 00000000000..579008b5658 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -0,0 +1,315 @@ +#!/bin/bash +# Apache 2.0 + +# This is based on TDNN_OPGRU_1A, but using the FastNormOPGRU to replace the NormPGRU. 
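+
+# "FastNormOPGRU" refers to the fast-norm-opgru-layer xconfig layer, which implements the
+# core GRU recurrence with the fused components added in this patch
+# (GruNonlinearityComponent / OutputGruNonlinearityComponent; see
+# src/nnet3/nnet-combined-component.{h,cc}), in place of the norm-opgru-layer used in
+# run_tdnn_opgru_1a.sh.  It is configured in stage 12 below, e.g.:
+#   fast-norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts
+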
+# For the details of OPGRU structure, please check the paper +# "Output-Gate Projected Gated Recurrent Unit for Speech Recognition" +# by Gaofeng Cheng et al, +# http://www.danielpovey.com/files/2018_interspeech_opgru.pdf + +# Different from the vanilla OPGRU, Norm-OPGRU adds batchnorm in its output (forward direction) +# and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar +# results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). + +# ./local/chain/compare_wer_general.sh --looped tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# System tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# WER on train_dev(tg) 12.31 12.41 +# [looped:] 12.26 12.38 +# WER on train_dev(fg) 11.49 11.60 +# [looped:] 11.43 11.65 +# WER on eval2000(tg) 14.9 15.1 +# [looped:] 15.0 15.1 +# WER on eval2000(fg) 13.5 13.7 +# [looped:] 13.5 13.7 +# Final train prob -0.068 -0.070 +# Final valid prob -0.091 -0.092 +# Final train prob (xent) -0.879 -0.889 +# Final valid prob (xent) -0.9667 -0.9723 + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_opgru_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.2@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= +test_online_decoding= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0 gru-nonlinearity-options=\"max-change=0.75\"" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/gru.py for the other options and defaults + fast-norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +wait; +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py index 530ba14474a..2f387a6a1e5 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/gru.py @@ -1,6 +1,7 @@ # Copyright 2016 Johns Hopkins University (Dan Povey) # 2017 Gaofeng Cheng (UCAS) # 2017 Lu Huang (THU) +# 2018 Hang Lyu # Apache 2.0. 
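The accompanying five-line change to egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py (listed in the diffstat but not shown in this excerpt) registers the new layer names so the xconfig parser can instantiate the classes below. A minimal sketch of what those entries would look like, assuming they follow parser.py's existing config_to_layer convention (the exact hunk is not reproduced here):

    # Sketch only: assumes parser.py's existing name-to-class mapping.
    import libs.nnet3.xconfig.layers as xlayers

    config_to_layer = {
        # ... existing entries ('input', 'relu-batchnorm-layer', 'norm-opgru-layer', etc.) ...
        'fast-gru-layer'        : xlayers.XconfigFastGruLayer,
        'fast-pgru-layer'       : xlayers.XconfigFastPgruLayer,
        'fast-norm-pgru-layer'  : xlayers.XconfigFastNormPgruLayer,
        'fast-opgru-layer'      : xlayers.XconfigFastOpgruLayer,
        'fast-norm-opgru-layer' : xlayers.XconfigFastNormOpgruLayer,
    }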
@@ -83,7 +84,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the LSTM config + # convenience function to generate the GRU config def generate_gru_config(self): # assign some variables to reduce verbosity @@ -468,7 +469,7 @@ def output_name(self, auxiliary_output = None): def output_dim(self, auxiliary_output = None): if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): - if node_name == 'c_t': + if node_name == 'h_t': return self.config['cell-dim'] # add code for other auxiliary_outputs here when we decide to expose them else: @@ -487,7 +488,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the PGRU config + # convenience function to generate the Norm-PGRU config def generate_pgru_config(self): # assign some variables to reduce verbosity @@ -711,7 +712,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the PGRU config + # convenience function to generate the OPGRU config def generate_pgru_config(self): # assign some variables to reduce verbosity @@ -922,7 +923,7 @@ def get_full_config(self): ans.append((config_name, line)) return ans - # convenience function to generate the PGRU config + # convenience function to generate the Norm-OPGRU config def generate_pgru_config(self): # assign some variables to reduce verbosity @@ -1039,3 +1040,1072 @@ def generate_pgru_config(self): configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip_renorm".format(name)) return configs + +# This class is for lines like +# 'fast-gru-layer name=gru1 input=[-1] delay=-3' +# It generates an GRU sub-graph without output projections. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# decay-time is deprecated under GRU or PGRU, as I found the PGRUs do not need the decay-time option to get generalized to unseen sequence length +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the GRU/LSTM ] +# clipping-threshold=30 [similar to LSTMs ,nnet3 GRUs use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
+# self-repair-scale-nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU/LSTM ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU/LSTM, can be used to do things like set biases to initialize to 1] +class XconfigFastGruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-gru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. + # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75' + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.descriptors['input']['dim'] + + def check_configs(self): + key = 'cell-dim' + if self.config['cell-dim'] <= 0: + raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + return self.config['cell-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_gru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the GRU config + def generate_gru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + delay = self.config['delay'] + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], abs(delay))) + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + affine_str = self.config['ng-affine-options'] + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate + # r_t = \sigmoid ( U^r x_t + W^r y_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h ( y_{t-1} \dot r_t ) ) + # y_t = ( 1 - z_t ) \dot h_t + z_t \dot y_{t-1} + # Note: + # naming convention: + # .W_. e.g. Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_r* matrices") + configs.append("component name={0}.W_r.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + + configs.append("# hpart_t related matrix : W_hpart matrice") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities for z_t and r_t") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append("# r_t") + configs.append("component-node name={0}.r_t_pre component={0}.W_r.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# y_t") + 
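+        # (For reference: in this non-projected layer the GruNonlinearityComponent consumes
+        # Append(z_t, r_t, hpart_t, y_{t-1}), i.e. an input of dimension 4 * cell-dim, and
+        # produces (h_t, c_t) of dimension 2 * cell-dim; the dim-range-node below selects
+        # the second half, c_t, which for the non-projected GRU equals the output y_t.)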
configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we just get the second half. Otherwise, in non-projection gru layer, y_t = c_t") + configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) + configs.append("dim-range-node name={0}.y_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# s_t : recurrence") + configs.append("# Note: in non-projection gru layer, the recurrent part equals the output, namely y_t.") + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name)) + return configs + + +# This class is for lines like +# 'fast-pgru-layer name=pgru1 input=[-1] delay=-3' +# It generates an PGRU sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigGruLayer for this +# simple RNN. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] +# delay=-1 [Delay in the recurrent connections of the GRU ] +# clipping-threshold=30 [nnet3 GRU use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
+# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] +class XconfigFastPgruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-pgru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. + # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75' + } + + def set_derived_configs(self): + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim'] + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise RuntimeError("{0} has invalid value {1}.".format( + key, self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise RuntimeError("recurrent+non-recurrent projection dim exceeds " + "cell dim.") + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {2}." 
+ .format(self.layer_type, key, + self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_pgru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the PGRU config + def generate_pgru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate + # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) ) + # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + # y_t = W^y c_t # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the GRU. + # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t + # dim(s_t) = recurrent_dim. + # Note: + # naming convention: + # .W_. e.g. 
Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_r* matrices") + configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) + + + configs.append("# hpart_t related matrix : W_hpart matric") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t and r_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# c_t") + configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.") + configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) + configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# the projected matrix W_y.c and y_t") + configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t component={0}.W_y.c input={0}.c_t".format(name)) + + configs.append("# s_t : recurrence") + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_pre".format(name)) + return configs + + +# This 
class is for lines like +# 'fast-norm-pgru-layer name=pgru1 input=[-1] delay=-3' + +# Different from the vanilla PGRU, the NormPGRU uses batchnorm in the forward direction +# and renorm in the recurrence. + +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] +# delay=-1 [Delay in the recurrent connections of the GRU ] +# clipping-threshold=30 [nnet3 GRU use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] +class XconfigFastNormPgruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-norm-pgru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. 
+ # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75', + 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added + 'dropout-per-frame' : True # If False, regular dropout, not per frame + } + + def set_derived_configs(self): + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim'] + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise RuntimeError("{0} has invalid value {1}.".format( + key, self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise RuntimeError("recurrent+non-recurrent projection dim exceeds " + "cell dim.") + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {2}." + .format(self.layer_type, key, + self.config[key])) + if ((self.config['dropout-proportion'] > 1.0 or + self.config['dropout-proportion'] < 0.0) and + self.config['dropout-proportion'] != -1.0 ): + raise RuntimeError("dropout-proportion has invalid value {0}." + .format(self.config['dropout-proportion'])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_pgru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the Norm-PGRU config + def generate_pgru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + dropout_proportion = self.config['dropout-proportion'] + dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate + # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) ) + # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + # y_t_tmp = W^y c_t + # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] ) # dim(s_t) = recurrent_dim. + # y_t = batchnorm ( y_t_tmp ) # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the GRU. + # Note: + # naming convention: + # .W_. e.g. 
Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_r* matrices") + configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) + + + configs.append("# hpart_t related matrix : W_hpart matric") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) + + if dropout_proportion != -1.0: + configs.append("# Defining the dropout component") + configs.append("component name={0}.dropout_z type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}" + .format(name, cell_dim, dropout_proportion, dropout_per_frame)) + configs.append("component name={0}.dropout_r type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}" + .format(name, rec_proj_dim, dropout_proportion, dropout_per_frame)) + + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + if dropout_proportion != -1.0: + configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name)) + configs.append("component-node name={0}.z_t component={0}.dropout_z input={0}.z_t_predrop".format(name)) + else: + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + + configs.append("# r_t") + configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + if dropout_proportion != -1.0: + configs.append("component-node name={0}.r_t_predrop component={0}.r input={0}.r_t_pre".format(name)) + configs.append("component-node name={0}.r_t component={0}.dropout_r input={0}.r_t_predrop".format(name)) + else: + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# c_t") + configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.") + configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin 
input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) + configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# the projected matrix W_y.c and y_t_tmp") + configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t_tmp component={0}.W_y.c input={0}.c_t".format(name)) + + configs.append("# s_t : recurrence") + configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name)) + + configs.append("# y_t : output") + configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name)) + return configs + + +# This class is for lines like +# 'fast-opgru-layer name=opgru1 input=[-1] delay=-3' +# It generates an PGRU sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigGruLayer for this +# simple RNN. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] +# delay=-1 [Delay in the recurrent connections of the GRU ] +# clipping-threshold=30 [nnet3 GRU use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
+# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] +# gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] +# ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] +class XconfigFastOpgruLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-opgru-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim + 'clipping-threshold' : 30.0, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 15.0, + # if you want to set 'self-repair-scale', ' self-repair-threshold' + # or 'param-stddev' for GruNonlinearityComponent + # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. + # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. + # you can also see src/nnet3/nnet-combined-component.h for detail + 'gru-nonlinearity-options' : ' max-change=0.75' + } + + def set_derived_configs(self): + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim'] + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise RuntimeError("{0} has invalid value {1}.".format( + key, self.config[key])) + + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise RuntimeError("recurrent+non-recurrent projection dim exceeds " + "cell dim.") + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise RuntimeError("{0} has invalid value {2}." 
+ .format(self.layer_type, key, + self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'y_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_pgru_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the OPGRU config + def generate_pgru_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + # string for GruNonlinearityComponent + gru_nonlin_str = self.config['gru-nonlinearity-options'] + + # formulation like: + # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate + # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} ) # reset gate + # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) + # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + # y_t = ( c_t \dot o_t ) W^y # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the GRU. + # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t + # dim(s_t) = recurrent_dim. + # Note: + # naming convention: + # .W_. e.g. 
Gru1.W_i.xr for matrix + # providing output to gate i and operating on an appended vector [x,r] + # notation convention: + # In order to be consistent with the notations which are used in + # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are + # used in paper to "h_t" and "c_t" + + configs = [] + configs.append("### Begin Gru layer '{0}'".format(name)) + configs.append("# Update gate control : W_z* matrices") + configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# Reset gate control : W_o* matrices") + configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + + + configs.append("# hpart_t related matrix : W_hpart matric") + configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + recurrent_connection = '{0}.s_t'.format(name) + + configs.append("# z_t and o_t") + configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name)) + + configs.append("# hpart_t") + configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) + + configs.append("# c_t") + configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.") + configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) + configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay)) + configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + + configs.append("# the projected matrix W_y.cdoto and y_t") + configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name)) + configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t component={0}.W_y.cdoto input={0}.cdoto".format(name)) + + configs.append("# s_t recurrence") + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_preclip 
+# This class is for lines like
+# 'fast-norm-opgru-layer name=opgru1 input=[-1] delay=-3'
+
+# Unlike the vanilla OPGRU, the NormOPGRU uses batchnorm in the forward direction
+# and renorm in the recurrence.
+
+# The cell dimension must be specified via 'cell-dim=xxx'; the output dimension of
+# the layer is recurrent-projection-dim + non-recurrent-projection-dim.
+# See other configuration values below.
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'                 [Descriptor giving the input of the layer.]
+#   cell-dim=-1                  [Dimension of the cell]
+#   recurrent-projection-dim     [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
+#   non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections,
+#                                 in addition to recurrent-projection-dim, e.g. cell-dim/4]
+#   delay=-1                     [Delay in the recurrent connections of the GRU]
+#   clipping-threshold=30        [nnet3 GRUs use a gradient clipping component at the recurrent connections.
+#                                 This is the threshold used to decide if clipping has to be activated]
+#   zeroing-interval=20          [interval at which we (possibly) zero out the recurrent derivatives.]
+#   zeroing-threshold=15         [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
+#   self-repair-scale-nonlinearity=1e-5  [A constant scaling the self-repair vector computed in derived
+#                                 classes of NonlinearComponent, i.e. SigmoidComponent, TanhComponent
+#                                 and RectifiedLinearComponent]
+#   ng-per-element-scale-options=''  [Additional options used for the diagonal matrices in the GRU]
+#   gru-nonlinearity-options=' max-change=0.75'  [options passed to the OutputGruNonlinearityComponent; see below for detail]
+#   ng-affine-options=''         [Additional options used for the full matrices in the GRU, can be used
+#                                 to do things like set biases to initialize to 1]
+class XconfigFastNormOpgruLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == "fast-norm-opgru-layer"
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input' : '[-1]',
+                       'cell-dim' : -1, # this is a compulsory argument
+                       'recurrent-projection-dim' : -1, # defaults to cell-dim / 4
+                       'non-recurrent-projection-dim' : -1, # defaults to
+                                                            # recurrent-projection-dim
+                       'clipping-threshold' : 30.0,
+                       'delay' : -1,
+                       'ng-per-element-scale-options' : ' max-change=0.75 ',
+                       'ng-affine-options' : ' max-change=0.75 ',
+                       'self-repair-scale-nonlinearity' : 0.00001,
+                       'zeroing-interval' : 20,
+                       'zeroing-threshold' : 15.0,
+                       # If you want to set 'self-repair-scale', 'self-repair-threshold'
+                       # or 'param-stddev' for the OutputGruNonlinearityComponent
+                       # (by default they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim),
+                       # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
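+                       # For example (an illustrative override, not a default):
+                       #   gru-nonlinearity-options=' max-change=0.75 self-repair-scale=2.0e-05'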
+                       # You can also see src/nnet3/nnet-combined-component.h for detail.
+                       'gru-nonlinearity-options' : ' max-change=0.75',
+                       'dropout-proportion' : -1.0,  # If -1.0, no dropout components will be added
+                       'dropout-per-frame' : True  # If False, regular dropout, not per frame
+                       }
+
+    def set_derived_configs(self):
+        if self.config['recurrent-projection-dim'] <= 0:
+            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
+
+        if self.config['non-recurrent-projection-dim'] <= 0:
+            self.config['non-recurrent-projection-dim'] = \
+               self.config['recurrent-projection-dim']
+
+    def check_configs(self):
+        for key in ['cell-dim', 'recurrent-projection-dim',
+                    'non-recurrent-projection-dim']:
+            if self.config[key] <= 0:
+                raise RuntimeError("{0} has invalid value {1}.".format(
+                    key, self.config[key]))
+
+        if self.config['delay'] == 0:
+            raise RuntimeError("delay cannot be zero")
+
+        if (self.config['recurrent-projection-dim'] +
+            self.config['non-recurrent-projection-dim'] >
+            self.config['cell-dim']):
+            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
+                               "cell dim.")
+        for key in ['self-repair-scale-nonlinearity']:
+            if self.config[key] < 0.0 or self.config[key] > 1.0:
+                raise RuntimeError("In layer of type {0}, {1} has invalid value {2}."
+                                   .format(self.layer_type, key,
+                                           self.config[key]))
+        if ((self.config['dropout-proportion'] > 1.0 or
+             self.config['dropout-proportion'] < 0.0) and
+            self.config['dropout-proportion'] != -1.0):
+            raise RuntimeError("dropout-proportion has invalid value {0}."
+                               .format(self.config['dropout-proportion']))
+
+    def auxiliary_outputs(self):
+        return ['c_t']
+
+    def output_name(self, auxiliary_output = None):
+        node_name = 'y_t'
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                node_name = auxiliary_output
+            else:
+                raise Exception("In layer '{0}' of type {1}, unknown auxiliary output name {2}"
+                                "".format(self.name, self.layer_type, auxiliary_output))
+
+        return '{0}.{1}'.format(self.name, node_name)
+
+    def output_dim(self, auxiliary_output = None):
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise Exception("In layer '{0}' of type {1}, unknown auxiliary output name {2}"
+                                "".format(self.name, self.layer_type, auxiliary_output))
+
+        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self.generate_pgru_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user-specified matrices in GRU initialization,
+                # so the 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
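+    # Note: each entry of the list returned by get_full_config() is a
+    # (config-name, config-line) pair, e.g. (illustrative only):
+    #   ('final', "component name=opgru1.W_z.xs type=NaturalGradientAffineComponent ...")
+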
+    # convenience function to generate the Norm-OPGRU config
+    def generate_pgru_config(self):
+
+        # assign some variables to reduce verbosity
+        name = self.name
+        # in the below code we will just refer to the descriptor_strings as 'descriptors' for conciseness
+        input_dim = self.descriptors['input']['dim']
+        input_descriptor = self.descriptors['input']['final-string']
+        cell_dim = self.config['cell-dim']
+        rec_proj_dim = self.config['recurrent-projection-dim']
+        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
+        delay = self.config['delay']
+        repair_nonlin = self.config['self-repair-scale-nonlinearity']
+        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
+        bptrunc_str = ("clipping-threshold={0}"
+                       " zeroing-threshold={1}"
+                       " zeroing-interval={2}"
+                       " recurrence-interval={3}"
+                       "".format(self.config['clipping-threshold'],
+                                 self.config['zeroing-threshold'],
+                                 self.config['zeroing-interval'],
+                                 abs(delay)))
+        affine_str = self.config['ng-affine-options']
+        pes_str = self.config['ng-per-element-scale-options']
+        dropout_proportion = self.config['dropout-proportion']
+        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'
+
+        # Natural gradient per element scale parameters
+        # TODO: decide if we want to keep exposing these options
+        if re.search('param-mean', pes_str) is None and \
+           re.search('param-stddev', pes_str) is None:
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
+
+        # options string for the OutputGruNonlinearityComponent
+        gru_nonlin_str = self.config['gru-nonlinearity-options']
+
+        # The formulation is:
+        # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} )    # update gate
+        # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} )    # output gate
+        # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} )
+        # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1}
+        # y_t_tmp = ( c_t \dot o_t ) W^y
+        # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] )  # dim(s_t) = recurrent_dim.
+        # y_t = batchnorm ( y_t_tmp )                 # dim(y_t) = recurrent_dim + non_recurrent_dim.
+        #                                               This is the output of the GRU.
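+        # The only differences from the plain OPGRU recurrence above are the renorm
+        # applied to s_t and the batchnorm applied to y_t (hence "Norm-OPGRU").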
+        # Note:
+        #   naming convention:
+        #     <layer-name>.W_<outputname>.<inputname> e.g. Gru1.W_i.xr for matrix
+        #     providing output to gate i and operating on an appended vector [x,r]
+        #   notation convention:
+        #     In order to be consistent with the notation used in
+        #     nnet-combined-component.cc, we map the "\tilde{h_t}" and "h_t" used in
+        #     the paper to "h_t" and "c_t" respectively.
+
+        configs = []
+        configs.append("### Begin Gru layer '{0}'".format(name))
+        configs.append("# Update gate control : W_z* matrices")
+        configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
+        configs.append("# Output gate control : W_o* matrices")
+        configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
+
+        configs.append("# hpart_t related matrix : W_hpart matrix")
+        configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim, affine_str))
+
+        configs.append("# Defining the non-linearities")
+        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+
+        if dropout_proportion != -1.0:
+            configs.append("# Defining the dropout component")
+            configs.append("component name={0}.dropout type=DropoutComponent dim={1} "
+                           "dropout-proportion={2} dropout-per-frame={3}"
+                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))
+
+        recurrent_connection = '{0}.s_t'.format(name)
+
+        configs.append("# z_t")
+        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
+        if dropout_proportion != -1.0:
+            configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name))
+            configs.append("component-node name={0}.z_t component={0}.dropout input={0}.z_t_predrop".format(name))
+        else:
+            configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))
+
+        configs.append("# o_t")
+        configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
+        if dropout_proportion != -1.0:
+            configs.append("component-node name={0}.o_t_predrop component={0}.o input={0}.o_t_pre".format(name))
+            configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name))
+        else:
+            configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name))
+
+        configs.append("# hpart_t")
+        configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor))
+
+        configs.append("# c_t")
+        configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.")
+        configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str))
+        configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay))
+        configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim))
+
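+        # 'cdoto' below denotes the elementwise product c_t \dot o_t from the
+        # formulation above; it is computed by an ElementwiseProductComponent.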
matrix W_y.cdoto and y_t_tmp") + configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name)) + configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component-node name={0}.y_t_tmp component={0}.W_y.cdoto input={0}.cdoto".format(name)) + + configs.append("# s_t : recurrence") + configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) + configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name)) + configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name)) + + configs.append("# y_t : output") + configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 01c1b1e533c..ca1f7d8372f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,6 +68,11 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, + 'fast-gru-layer' : xlayers.XconfigFastGruLayer, + 'fast-pgru-layer' : xlayers.XconfigFastPgruLayer, + 'fast-norm-pgru-layer' : xlayers.XconfigFastNormPgruLayer, + 'fast-opgru-layer' : xlayers.XconfigFastOpgruLayer, + 'fast-norm-opgru-layer' : xlayers.XconfigFastNormOpgruLayer, 'tdnnf-layer': xlayers.XconfigTdnnfLayer, 'prefinal-layer': xlayers.XconfigPrefinalLayer, 'renorm-component': xlayers.XconfigRenormComponent, diff --git a/egs/wsj/s5/steps/nnet3/get_saturation.pl b/egs/wsj/s5/steps/nnet3/get_saturation.pl index ed18fc1c399..979736f0847 100755 --- a/egs/wsj/s5/steps/nnet3/get_saturation.pl +++ b/egs/wsj/s5/steps/nnet3/get_saturation.pl @@ -74,6 +74,14 @@ if (! 
$ok) { print STDERR "Could not parse at least one of the avg-deriv values in the following info line: $_"; } + } elsif (m/type=.*GruNonlinearityComponent/) { + if (m/deriv-avg=[^m]+mean=([^,]+),/) { + $num_nonlinearities += 1; + my $this_saturation = 1.0 - ($1 / 1.0); + $total_saturation += $this_saturation; + } else { + print STDERR "$0: could not make sense of line (no deriv-avg?): $_"; + } } } diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 135853cadc3..6214592303b 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -15,7 +15,7 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-common-test convolution-test attention-test OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ - nnet-simple-component.o nnet-normalize-component.o \ + nnet-simple-component.o nnet-combined-component.o nnet-normalize-component.o \ nnet-general-component.o nnet-parse.o natural-gradient-online.o \ nnet-descriptor.o nnet-optimize.o nnet-computation.o \ nnet-computation-graph.o nnet-graph.o am-nnet-simple.o \ diff --git a/src/nnet3/nnet-combined-component.cc b/src/nnet3/nnet-combined-component.cc new file mode 100644 index 00000000000..0a2fb3f5a91 --- /dev/null +++ b/src/nnet3/nnet-combined-component.cc @@ -0,0 +1,2332 @@ +// nnet3/nnet-combined-component.cc + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) +// 2015 Daniel Galvez +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "nnet3/nnet-combined-component.h" +#include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" + +namespace kaldi { +namespace nnet3 { + +// Constructors for the convolution component +ConvolutionComponent::ConvolutionComponent(): + UpdatableComponent(), + input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), + filt_x_dim_(0), filt_y_dim_(0), + filt_x_step_(0), filt_y_step_(0), + input_vectorization_(kZyx) { } + +ConvolutionComponent::ConvolutionComponent( + const ConvolutionComponent &component): + UpdatableComponent(component), + input_x_dim_(component.input_x_dim_), + input_y_dim_(component.input_y_dim_), + input_z_dim_(component.input_z_dim_), + filt_x_dim_(component.filt_x_dim_), + filt_y_dim_(component.filt_y_dim_), + filt_x_step_(component.filt_x_step_), + filt_y_step_(component.filt_y_step_), + input_vectorization_(component.input_vectorization_), + filter_params_(component.filter_params_), + bias_params_(component.bias_params_) { } + +ConvolutionComponent::ConvolutionComponent( + const CuMatrixBase &filter_params, + const CuVectorBase &bias_params, + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + BaseFloat learning_rate): + input_x_dim_(input_x_dim), + input_y_dim_(input_y_dim), + input_z_dim_(input_z_dim), + filt_x_dim_(filt_x_dim), + filt_y_dim_(filt_y_dim), + filt_x_step_(filt_x_step), + filt_y_step_(filt_y_step), + input_vectorization_(input_vectorization), + filter_params_(filter_params), + bias_params_(bias_params){ + KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() && + bias_params.Dim() != 0); + KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim); + SetUnderlyingLearningRate(learning_rate); + is_gradient_ = false; +} + +// aquire input dim +int32 ConvolutionComponent::InputDim() const { + return input_x_dim_ * input_y_dim_ * input_z_dim_; +} + +// aquire output dim +int32 ConvolutionComponent::OutputDim() const { + int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); + int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); + int32 num_filters = filter_params_.NumRows(); + return num_x_steps * num_y_steps * num_filters; +} + +// initialize the component using hyperparameters +void ConvolutionComponent::Init( + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, int32 num_filters, + TensorVectorizationType input_vectorization, + BaseFloat param_stddev, BaseFloat bias_stddev) { + input_x_dim_ = input_x_dim; + input_y_dim_ = input_y_dim; + input_z_dim_ = input_z_dim; + filt_x_dim_ = filt_x_dim; + filt_y_dim_ = filt_y_dim; + filt_x_step_ = filt_x_step; + filt_y_step_ = filt_y_step; + input_vectorization_ = input_vectorization; + KALDI_ASSERT((input_x_dim_ - filt_x_dim_) % filt_x_step_ == 0); + KALDI_ASSERT((input_y_dim_ - filt_y_dim_) % filt_y_step_ == 0); + int32 filter_dim = filt_x_dim_ * filt_y_dim_ * input_z_dim_; + filter_params_.Resize(num_filters, filter_dim); + bias_params_.Resize(num_filters); + KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0); + filter_params_.SetRandn(); + filter_params_.Scale(param_stddev); + bias_params_.SetRandn(); + bias_params_.Scale(bias_stddev); +} + +// initialize the component using predefined matrix file +void ConvolutionComponent::Init( + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + 
int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + std::string matrix_filename) { + input_x_dim_ = input_x_dim; + input_y_dim_ = input_y_dim; + input_z_dim_ = input_z_dim; + filt_x_dim_ = filt_x_dim; + filt_y_dim_ = filt_y_dim; + filt_x_step_ = filt_x_step; + filt_y_step_ = filt_y_step; + input_vectorization_ = input_vectorization; + CuMatrix mat; + ReadKaldiObject(matrix_filename, &mat); + int32 filter_dim = (filt_x_dim_ * filt_y_dim_ * input_z_dim_); + int32 num_filters = mat.NumRows(); + KALDI_ASSERT(mat.NumCols() == (filter_dim + 1)); + filter_params_.Resize(num_filters, filter_dim); + bias_params_.Resize(num_filters); + filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim)); + bias_params_.CopyColFromMat(mat, filter_dim); +} + +// display information about component +std::string ConvolutionComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", input-x-dim=" << input_x_dim_ + << ", input-y-dim=" << input_y_dim_ + << ", input-z-dim=" << input_z_dim_ + << ", filt-x-dim=" << filt_x_dim_ + << ", filt-y-dim=" << filt_y_dim_ + << ", filt-x-step=" << filt_x_step_ + << ", filt-y-step=" << filt_y_step_ + << ", input-vectorization=" << input_vectorization_ + << ", num-filters=" << filter_params_.NumRows(); + PrintParameterStats(stream, "filter-params", filter_params_); + PrintParameterStats(stream, "bias-params", bias_params_, true); + return stream.str(); +} + +// initialize the component using configuration file +void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) { + bool ok = true; + std::string matrix_filename; + int32 input_x_dim = -1, input_y_dim = -1, input_z_dim = -1, + filt_x_dim = -1, filt_y_dim = -1, + filt_x_step = -1, filt_y_step = -1, + num_filters = -1; + std::string input_vectorization_order = "zyx"; + InitLearningRatesFromConfig(cfl); + ok = ok && cfl->GetValue("input-x-dim", &input_x_dim); + ok = ok && cfl->GetValue("input-y-dim", &input_y_dim); + ok = ok && cfl->GetValue("input-z-dim", &input_z_dim); + ok = ok && cfl->GetValue("filt-x-dim", &filt_x_dim); + ok = ok && cfl->GetValue("filt-y-dim", &filt_y_dim); + ok = ok && cfl->GetValue("filt-x-step", &filt_x_step); + ok = ok && cfl->GetValue("filt-y-step", &filt_y_step); + + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + // optional argument + TensorVectorizationType input_vectorization; + cfl->GetValue("input-vectorization-order", &input_vectorization_order); + if (input_vectorization_order.compare("zyx") == 0) { + input_vectorization = kZyx; + } else if (input_vectorization_order.compare("yzx") == 0) { + input_vectorization = kYzx; + } else { + KALDI_ERR << "Unknown or unsupported input vectorization order " + << input_vectorization_order + << " accepted candidates are 'yzx' and 'zyx'"; + } + + if (cfl->GetValue("matrix", &matrix_filename)) { + // initialize from prefined parameter matrix + Init(input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + input_vectorization, + matrix_filename); + } else { + ok = ok && cfl->GetValue("num-filters", &num_filters); + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + // initialize from configuration + int32 filter_input_dim = filt_x_dim * filt_y_dim * input_z_dim; + BaseFloat param_stddev = 1.0 / std::sqrt(filter_input_dim), bias_stddev = 1.0; + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("bias-stddev", &bias_stddev); + Init(input_x_dim, input_y_dim, 
input_z_dim, + filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, num_filters, + input_vectorization, param_stddev, bias_stddev); + } + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); +} + +// Inline methods to convert from tensor index i.e., (x,y,z) index +// to index in yzx or zyx vectorized tensors +inline int32 YzxVectorIndex(int32 x, int32 y, int32 z, + int32 input_x_dim, + int32 input_y_dim, + int32 input_z_dim) { + KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); + return (input_y_dim * input_z_dim) * x + (input_y_dim) * z + y; +} + +inline int32 ZyxVectorIndex(int32 x, int32 y, int32 z, + int32 input_x_dim, + int32 input_y_dim, + int32 input_z_dim) { + KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); + return (input_y_dim * input_z_dim) * x + (input_z_dim) * y + z; +} + +// Method to convert from a matrix representing a minibatch of vectorized +// 3D tensors to patches for convolution, each patch corresponds to +// one dot product in the convolution +void ConvolutionComponent::InputToInputPatches( + const CuMatrixBase& in, + CuMatrix *patches) const{ + int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); + int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); + const int32 filt_x_step = filt_x_step_, + filt_y_step = filt_y_step_, + filt_x_dim = filt_x_dim_, + filt_y_dim = filt_y_dim_, + input_x_dim = input_x_dim_, + input_y_dim = input_y_dim_, + input_z_dim = input_z_dim_, + filter_dim = filter_params_.NumCols(); + + std::vector column_map(patches->NumCols()); + int32 column_map_size = column_map.size(); + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + int32 patch_start_index = patch_number * filter_dim; + for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { + for (int32 y = 0; y < filt_y_dim; y++) { + for (int32 z = 0; z < input_z_dim; z++, index++) { + KALDI_ASSERT(index < column_map_size); + if (input_vectorization_ == kZyx) { + column_map[index] = ZyxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } else if (input_vectorization_ == kYzx) { + column_map[index] = YzxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } + } + } + } + } + } + CuArray cu_cols(column_map); + patches->CopyCols(in, cu_cols); +} + + +// propagation function +// see function declaration in nnet-simple-component.h for details +void* ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + num_filters = filter_params_.NumRows(), + num_frames = in.NumRows(), + filter_dim = filter_params_.NumCols(); + KALDI_ASSERT((*out).NumRows() == num_frames && + (*out).NumCols() == (num_filters * num_x_steps * num_y_steps)); + + CuMatrix patches(num_frames, + num_x_steps * num_y_steps * filter_dim, + kUndefined); + InputToInputPatches(in, &patches); + CuSubMatrix* filter_params_elem = new CuSubMatrix( + filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); + std::vector* > tgt_batch, 
patch_batch, + filter_params_batch; + + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + tgt_batch.push_back(new CuSubMatrix( + out->ColRange(patch_number * num_filters, num_filters))); + patch_batch.push_back(new CuSubMatrix( + patches.ColRange(patch_number * filter_dim, filter_dim))); + filter_params_batch.push_back(filter_params_elem); + tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias + } + } + // apply all filters + AddMatMatBatched(1.0, tgt_batch, patch_batch, + kNoTrans, filter_params_batch, + kTrans, 1.0); + // release memory + delete filter_params_elem; + for (int32 p = 0; p < tgt_batch.size(); p++) { + delete tgt_batch[p]; + delete patch_batch[p]; + } + return NULL; +} + +// scale the parameters +void ConvolutionComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + filter_params_.SetZero(); + bias_params_.SetZero(); + } else { + filter_params_.Scale(scale); + bias_params_.Scale(scale); + } +} + +// add another convolution component +void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) { + const ConvolutionComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + filter_params_.AddMat(alpha, other->filter_params_); + bias_params_.AddVec(alpha, other->bias_params_); +} + +/* + This function transforms a vector of lists into a list of vectors, + padded with -1. + @param[in] The input vector of lists. Let in.size() be D, and let + the longest list length (i.e. the max of in[i].size()) be L. + @param[out] The output list of vectors. The length of the list will + be L, each vector-dimension will be D (i.e. out[i].size() == D), + and if in[i] == j, then for some k we will have that + out[k][j] = i. The output vectors are padded with -1 + where necessary if not all the input lists have the same side. 
+*/ +void RearrangeIndexes(const std::vector > &in, + std::vector > *out) { + int32 D = in.size(); + int32 L = 0; + for (int32 i = 0; i < D; i++) + if (in[i].size() > L) + L = in[i].size(); + out->resize(L); + for (int32 i = 0; i < L; i++) + (*out)[i].resize(D, -1); + for (int32 i = 0; i < D; i++) { + for (int32 j = 0; j < in[i].size(); j++) { + (*out)[j][i] = in[i][j]; + } + } +} + +// Method to compute the input derivative matrix from the input derivatives +// for patches, where each patch corresponds to one dot product +// in the convolution +void ConvolutionComponent::InderivPatchesToInderiv( + const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const { + + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + filt_x_step = filt_x_step_, + filt_y_step = filt_y_step_, + filt_x_dim = filt_x_dim_, + filt_y_dim = filt_y_dim_, + input_x_dim = input_x_dim_, + input_y_dim = input_y_dim_, + input_z_dim = input_z_dim_, + filter_dim = filter_params_.NumCols(); + + // Compute the reverse column_map from the matrix with input + // derivative patches to input derivative matrix + std::vector > reverse_column_map(in_deriv->NumCols()); + int32 rev_col_map_size = reverse_column_map.size(); + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + int32 patch_start_index = patch_number * filter_dim; + for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { + for (int32 y = 0; y < filt_y_dim; y++) { + for (int32 z = 0; z < input_z_dim; z++, index++) { + int32 vector_index; + if (input_vectorization_ == kZyx) { + vector_index = ZyxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } else { + KALDI_ASSERT(input_vectorization_ == kYzx); + vector_index = YzxVectorIndex(x_step * filt_x_step + x, + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); + } + KALDI_ASSERT(vector_index < rev_col_map_size); + reverse_column_map[vector_index].push_back(index); + } + } + } + } + } + std::vector > rearranged_column_map; + RearrangeIndexes(reverse_column_map, &rearranged_column_map); + for (int32 p = 0; p < rearranged_column_map.size(); p++) { + CuArray cu_cols(rearranged_column_map[p]); + in_deriv->AddCols(in_deriv_patches, cu_cols); + } +} + +// back propagation function +// see function declaration in nnet-simple-component.h for details +void ConvolutionComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + ConvolutionComponent *to_update = + dynamic_cast(to_update_in); + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + num_filters = filter_params_.NumRows(), + num_frames = out_deriv.NumRows(), + filter_dim = filter_params_.NumCols(); + + KALDI_ASSERT(out_deriv.NumRows() == num_frames && + out_deriv.NumCols() == + (num_filters * num_x_steps * num_y_steps)); + + // Compute inderiv patches + CuMatrix in_deriv_patches(num_frames, + num_x_steps * num_y_steps * filter_dim, + kSetZero); + + std::vector* > patch_deriv_batch, out_deriv_batch, + filter_params_batch; + CuSubMatrix* filter_params_elem = new 
CuSubMatrix( + filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); + + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + + patch_deriv_batch.push_back(new CuSubMatrix( + in_deriv_patches.ColRange( + patch_number * filter_dim, filter_dim))); + out_deriv_batch.push_back(new CuSubMatrix(out_deriv.ColRange( + patch_number * num_filters, num_filters))); + filter_params_batch.push_back(filter_params_elem); + } + } + AddMatMatBatched(1.0, patch_deriv_batch, + out_deriv_batch, kNoTrans, + filter_params_batch, kNoTrans, 0.0); + + if (in_deriv) { + // combine the derivatives from the individual input deriv patches + // to compute input deriv matrix + InderivPatchesToInderiv(in_deriv_patches, in_deriv); + } + + if (to_update != NULL) { + to_update->Update(debug_info, in_value, out_deriv, out_deriv_batch); + } + + // release memory + delete filter_params_elem; + for (int32 p = 0; p < patch_deriv_batch.size(); p++) { + delete patch_deriv_batch[p]; + delete out_deriv_batch[p]; + } +} + + +// update parameters +// see function declaration in nnet-simple-component.h for details +void ConvolutionComponent::Update(const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv, + const std::vector *>& out_deriv_batch) { + // useful dims + const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), + num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), + num_filters = filter_params_.NumRows(), + num_frames = out_deriv.NumRows(), + filter_dim = filter_params_.NumCols(); + KALDI_ASSERT(out_deriv.NumRows() == num_frames && + out_deriv.NumCols() == + (num_filters * num_x_steps * num_y_steps)); + + + CuMatrix filters_grad; + CuVector bias_grad; + + CuMatrix input_patches(num_frames, + filter_dim * num_x_steps * num_y_steps, + kUndefined); + InputToInputPatches(in_value, &input_patches); + + filters_grad.Resize(num_filters, filter_dim, kSetZero); // reset + bias_grad.Resize(num_filters, kSetZero); // reset + + // create a single large matrix holding the smaller matrices + // from the vector container filters_grad_batch along the rows + CuMatrix filters_grad_blocks_batch( + num_x_steps * num_y_steps * filters_grad.NumRows(), + filters_grad.NumCols()); + + std::vector* > filters_grad_batch, input_patch_batch; + + for (int32 x_step = 0; x_step < num_x_steps; x_step++) { + for (int32 y_step = 0; y_step < num_y_steps; y_step++) { + int32 patch_number = x_step * num_y_steps + y_step; + filters_grad_batch.push_back(new CuSubMatrix( + filters_grad_blocks_batch.RowRange( + patch_number * filters_grad.NumRows(), filters_grad.NumRows()))); + + input_patch_batch.push_back(new CuSubMatrix( + input_patches.ColRange(patch_number * filter_dim, filter_dim))); + } + } + + AddMatMatBatched(1.0, filters_grad_batch, out_deriv_batch, kTrans, + input_patch_batch, kNoTrans, 1.0); + + // add the row blocks together to filters_grad + filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch); + + // create a matrix holding the col blocks sum of out_deriv + CuMatrix out_deriv_col_blocks_sum(out_deriv.NumRows(), + num_filters); + + // add the col blocks together to out_deriv_col_blocks_sum + out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv); + + bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0); + + // release memory + for (int32 p = 0; p < input_patch_batch.size(); p++) { + delete filters_grad_batch[p]; + delete 
input_patch_batch[p]; + } + + // + // update + // + filter_params_.AddMat(learning_rate_, filters_grad); + bias_params_.AddVec(learning_rate_, bias_grad); +} + +void ConvolutionComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_x_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_y_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_z_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_x_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_y_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_x_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &filt_y_step_); + ExpectToken(is, binary, ""); + int32 input_vectorization; + ReadBasicType(is, binary, &input_vectorization); + input_vectorization_ = static_cast(input_vectorization); + ExpectToken(is, binary, ""); + filter_params_.Read(is, binary); + ExpectToken(is, binary, ""); + bias_params_.Read(is, binary); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &is_gradient_); + ExpectToken(is, binary, ""); + } else { + is_gradient_ = false; + KALDI_ASSERT(tok == ""); + } +} + +void ConvolutionComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // write opening tag and learning rate. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_x_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_z_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_x_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_x_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, filt_y_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, static_cast(input_vectorization_)); + WriteToken(os, binary, ""); + filter_params_.Write(os, binary); + WriteToken(os, binary, ""); + bias_params_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, is_gradient_); + WriteToken(os, binary, ""); +} + +BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const { + const ConvolutionComponent *other = + dynamic_cast(&other_in); + return TraceMatMat(filter_params_, other->filter_params_, kTrans) + + VecVec(bias_params_, other->bias_params_); +} + +Component* ConvolutionComponent::Copy() const { + ConvolutionComponent *ans = new ConvolutionComponent(*this); + return ans; +} + +void ConvolutionComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_filter_params(filter_params_); + temp_filter_params.SetRandn(); + filter_params_.AddMat(stddev, temp_filter_params); + + CuVector temp_bias_params(bias_params_); + temp_bias_params.SetRandn(); + bias_params_.AddVec(stddev, temp_bias_params); +} + +void ConvolutionComponent::SetParams(const VectorBase &bias, + const MatrixBase &filter) { + bias_params_ = bias; + filter_params_ = filter; + KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows()); +} + +int32 ConvolutionComponent::NumParameters() const { + return (filter_params_.NumCols() + 1) * filter_params_.NumRows(); +} + +void ConvolutionComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == this->NumParameters()); + int32 num_filter_params = 
filter_params_.NumCols() * filter_params_.NumRows(); + params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_); + params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_); +} +void ConvolutionComponent::UnVectorize(const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == this->NumParameters()); + int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows(); + filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params)); + bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim())); +} + +// aquire input dim +int32 MaxpoolingComponent::InputDim() const { + return input_x_dim_ * input_y_dim_ * input_z_dim_; +} + +MaxpoolingComponent::MaxpoolingComponent( + const MaxpoolingComponent &component): + input_x_dim_(component.input_x_dim_), + input_y_dim_(component.input_y_dim_), + input_z_dim_(component.input_z_dim_), + pool_x_size_(component.pool_x_size_), + pool_y_size_(component.pool_y_size_), + pool_z_size_(component.pool_z_size_), + pool_x_step_(component.pool_x_step_), + pool_y_step_(component.pool_y_step_), + pool_z_step_(component.pool_z_step_) { } + +// aquire output dim +int32 MaxpoolingComponent::OutputDim() const { + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; + return num_pools_x * num_pools_y * num_pools_z; +} + +// check the component parameters +void MaxpoolingComponent::Check() const { + // sanity check of the max pooling parameters + KALDI_ASSERT(input_x_dim_ > 0); + KALDI_ASSERT(input_y_dim_ > 0); + KALDI_ASSERT(input_z_dim_ > 0); + KALDI_ASSERT(pool_x_size_ > 0); + KALDI_ASSERT(pool_y_size_ > 0); + KALDI_ASSERT(pool_z_size_ > 0); + KALDI_ASSERT(pool_x_step_ > 0); + KALDI_ASSERT(pool_y_step_ > 0); + KALDI_ASSERT(pool_z_step_ > 0); + KALDI_ASSERT(input_x_dim_ >= pool_x_size_); + KALDI_ASSERT(input_y_dim_ >= pool_y_size_); + KALDI_ASSERT(input_z_dim_ >= pool_z_size_); + KALDI_ASSERT(pool_x_size_ >= pool_x_step_); + KALDI_ASSERT(pool_y_size_ >= pool_y_step_); + KALDI_ASSERT(pool_z_size_ >= pool_z_step_); + KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0); + KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0); + KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0); +} + +// initialize the component using configuration file +void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { + bool ok = true; + + ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_); + ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_); + ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_); + ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_); + ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_); + ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_); + ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_); + ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_); + ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_); + + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (!ok) + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + + Check(); +} + +// Method to convert from a matrix representing a minibatch of vectorized +// 3D tensors to patches for 3d max pooling, each patch corresponds to +// the nodes having the same local coordinatenodes from each pool +void MaxpoolingComponent::InputToInputPatches( + const CuMatrixBase& in, + 
CuMatrix *patches) const{ + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; + + std::vector column_map(patches->NumCols()); + int32 column_map_size = column_map.size(); + for (int32 x = 0, index =0; x < pool_x_size_; x++) { + for (int32 y = 0; y < pool_y_size_; y++) { + for (int32 z = 0; z < pool_z_size_; z++) { + // given the local node coordinate, group them from each pool + // to form a patch + for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { + for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { + for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { + KALDI_ASSERT(index < column_map_size); + column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + + (y_pool * pool_y_step_ + y) * input_z_dim_ + + (z_pool * pool_z_step_ + z); + + } + } + } + } + } + } + CuArray cu_cols(column_map); + patches->CopyCols(in, cu_cols); +} + +/* + This is the 3d max pooling propagate function. + It is assumed that each row of the input matrix + is a vectorized 3D-tensor of type zxy. + Similar to the propagate function of ConvolutionComponent, + the input matrix is first arranged into patches so that + pools (with / without overlapping) could be + processed in a parallelizable manner. + The output matrix is also a vectorized 3D-tensor of type zxy. +*/ + +void* MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + int32 num_frames = in.NumRows(); + int32 num_pools = OutputDim(); + int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; + CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); + InputToInputPatches(in, &patches); + + out->Set(-1e20); // reset a large negative value + for (int32 q = 0; q < pool_size; q++) + out->Max(patches.ColRange(q * num_pools, num_pools)); + return NULL; +} + +// Method to compute the input derivative matrix from the input derivatives +// for patches, where each patch corresponds to +// the nodes having the same local coordinatenodes from each pool +void MaxpoolingComponent::InderivPatchesToInderiv( + const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const { + + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; + + std::vector > reverse_column_map(in_deriv->NumCols()); + int32 rev_col_map_size = reverse_column_map.size(); + for (int32 x = 0, index = 0; x < pool_x_size_; x++) { + for (int32 y = 0; y < pool_y_size_; y++) { + for (int32 z = 0; z < pool_z_size_; z++) { + + for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { + for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { + for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { + int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + + (y_pool * pool_y_step_ + y) * input_z_dim_ + + (z_pool * pool_z_step_ + z); + + KALDI_ASSERT(vector_index < rev_col_map_size); + reverse_column_map[vector_index].push_back(index); + } + } + } + } + } + } + std::vector > rearranged_column_map; + RearrangeIndexes(reverse_column_map, &rearranged_column_map); + for (int32 p = 0; p < rearranged_column_map.size(); p++) { + CuArray cu_cols(rearranged_column_map[p]); + in_deriv->AddCols(in_deriv_patches, cu_cols); + } +} + +/* + 3d 
max pooling backpropagate function + This function backpropagate the error from + out_deriv to in_deriv. + In order to select the node in each pool to + backpropagate the error, it has to compare + the output pool value stored in the out_value + matrix with each of its input pool member node + stroed in the in_value matrix. +*/ +void MaxpoolingComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const { + if (!in_deriv) + return; + + int32 num_frames = in_value.NumRows(); + int32 num_pools = OutputDim(); + int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; + CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); + InputToInputPatches(in_value, &patches); + + for (int32 q = 0; q < pool_size; q++) { + // zero-out mask + CuMatrix mask; + out_value.EqualElementMask(patches.ColRange(q * num_pools, num_pools), &mask); + mask.MulElements(out_deriv); + patches.ColRange(q * num_pools, num_pools).CopyFromMat(mask); + } + + // combine the derivatives from the individual input deriv patches + // to compute input deriv matrix + InderivPatchesToInderiv(patches, in_deriv); +} + +void MaxpoolingComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_x_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_y_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &input_z_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_x_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_y_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_z_size_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_x_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_y_step_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &pool_z_step_); + ExpectToken(is, binary, ""); + Check(); +} + +void MaxpoolingComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_x_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_z_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_step_); + WriteToken(os, binary, ""); +} + +// display information about component +std::string MaxpoolingComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", input-x-dim=" << input_x_dim_ + << ", input-y-dim=" << input_y_dim_ + << ", input-z-dim=" << input_z_dim_ + << ", pool-x-size=" << pool_x_size_ + << ", pool-y-size=" << pool_y_size_ + << ", pool-z-size=" << pool_z_size_ + << ", pool-x-step=" << pool_x_step_ + << ", pool-y-step=" << pool_y_step_ + << ", pool-z-step=" << pool_z_step_; + return stream.str(); +} + + +int32 LstmNonlinearityComponent::InputDim() const { + int32 cell_dim = value_sum_.NumCols(); + return cell_dim * 5 + 
(use_dropout_ ? 3 : 0); +} + +int32 LstmNonlinearityComponent::OutputDim() const { + int32 cell_dim = value_sum_.NumCols(); + return cell_dim * 2; +} + + +void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. + ExpectToken(is, binary, ""); + params_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + self_repair_config_.Read(is, binary); + ExpectToken(is, binary, ""); + self_repair_total_.Read(is, binary); + + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &use_dropout_); + ReadToken(is, binary, &tok); + } else { + use_dropout_ = false; + } + KALDI_ASSERT(tok == ""); + ReadBasicType(is, binary, &count_); + + // For the on-disk format, we normalze value_sum_, deriv_sum_ and + // self_repair_total_ by dividing by the count, but in memory they are scaled + // by the count. [for self_repair_total_, the scaling factor is count_ * + // cell_dim]. + value_sum_.Scale(count_); + deriv_sum_.Scale(count_); + int32 cell_dim = params_.NumCols(); + self_repair_total_.Scale(count_ * cell_dim); + + InitNaturalGradient(); + + ExpectToken(is, binary, ""); + +} + +void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // Read opening tag and learning rate. + + WriteToken(os, binary, ""); + params_.Write(os, binary); + WriteToken(os, binary, ""); + { + Matrix value_avg(value_sum_); + if (count_ != 0.0) + value_avg.Scale(1.0 / count_); + value_avg.Write(os, binary); + } + WriteToken(os, binary, ""); + { + Matrix deriv_avg(deriv_sum_); + if (count_ != 0.0) + deriv_avg.Scale(1.0 / count_); + deriv_avg.Write(os, binary); + } + WriteToken(os, binary, ""); + self_repair_config_.Write(os, binary); + WriteToken(os, binary, ""); + { + int32 cell_dim = params_.NumCols(); + Vector self_repair_prob(self_repair_total_); + if (count_ != 0.0) + self_repair_prob.Scale(1.0 / (count_ * cell_dim)); + self_repair_prob.Write(os, binary); + } + if (use_dropout_) { + // only write this if true; we have back-compat code in reading anyway. + // this makes the models without dropout easier to read with older code. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); +} + + + +std::string LstmNonlinearityComponent::Info() const { + std::ostringstream stream; + int32 cell_dim = params_.NumCols(); + stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim + << ", use-dropout=" << (use_dropout_ ? "true" : "false"); + PrintParameterStats(stream, "w_ic", params_.Row(0)); + PrintParameterStats(stream, "w_fc", params_.Row(1)); + PrintParameterStats(stream, "w_oc", params_.Row(2)); + + // Note: some of the following code mirrors the code in + // UpdatableComponent::Info(), in nnet-component-itf.cc. 
+ if (count_ > 0) { + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + } + static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh", + "o_t_sigmoid", "m_t_tanh" }; + for (int32 i = 0; i < 5; i++) { + stream << ", " << nonlin_names[i] << "={"; + stream << " self-repair-lower-threshold=" << self_repair_config_(i) + << ", self-repair-scale=" << self_repair_config_(i + 5); + + if (count_ != 0) { + BaseFloat self_repaired_proportion = + self_repair_total_(i) / (count_ * cell_dim); + stream << ", self-repaired-proportion=" << self_repaired_proportion; + Vector value_sum(value_sum_.Row(i)), + deriv_sum(deriv_sum_.Row(i)); + Vector value_avg(value_sum), deriv_avg(deriv_sum); + value_avg.Scale(1.0 / count_); + deriv_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg) + << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + stream << " }"; + } + return stream.str(); +} + + +Component* LstmNonlinearityComponent::Copy() const { + return new LstmNonlinearityComponent(*this); +} + +void LstmNonlinearityComponent::ZeroStats() { + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_.SetZero(); + count_ = 0.0; +} + +void LstmNonlinearityComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + params_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_.SetZero(); + count_ = 0.0; + } else { + params_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_.Scale(scale); + count_ *= scale; + } +} + +void LstmNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const LstmNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + params_.AddMat(alpha, other->params_); + value_sum_.AddMat(alpha, other->value_sum_); + deriv_sum_.AddMat(alpha, other->deriv_sum_); + self_repair_total_.AddVec(alpha, other->self_repair_total_); + count_ += alpha * other->count_; +} + +void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_params(params_.NumRows(), params_.NumCols()); + temp_params.SetRandn(); + params_.AddMat(stddev, temp_params); +} + +BaseFloat LstmNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const LstmNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return TraceMatMat(params_, other->params_, kTrans); +} + +int32 LstmNonlinearityComponent::NumParameters() const { + return params_.NumRows() * params_.NumCols(); +} + +void LstmNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyRowsFromMat(params_); +} + + +void LstmNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + params_.CopyRowsFromVec(params); +} + + +void* LstmNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in, + CuMatrixBase *out) const { + cu::ComputeLstmNonlinearity(in, params_, out); + return NULL; +} + + +void LstmNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + + if (to_update_in == NULL) { + cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, + deriv_sum_, self_repair_config_, + count_, in_deriv, + 
(CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL); + } else { + LstmNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(to_update != NULL); + + int32 cell_dim = params_.NumCols(); + CuMatrix params_deriv(3, cell_dim, kUndefined); + CuMatrix self_repair_total(5, cell_dim, kUndefined); + + cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, + deriv_sum_, self_repair_config_, + count_, in_deriv, ¶ms_deriv, + &(to_update->value_sum_), + &(to_update->deriv_sum_), + &self_repair_total); + + CuVector self_repair_total_sum(5); + self_repair_total_sum.AddColSumMat(1.0, self_repair_total, 0.0); + to_update->self_repair_total_.AddVec(1.0, self_repair_total_sum); + to_update->count_ += static_cast(in_value.NumRows()); + + BaseFloat scale = 1.0; + if (!to_update->is_gradient_) { + to_update->preconditioner_.PreconditionDirections( + ¶ms_deriv, &scale); + } + to_update->params_.AddMat(to_update->learning_rate_ * scale, + params_deriv); + } +} + +LstmNonlinearityComponent::LstmNonlinearityComponent( + const LstmNonlinearityComponent &other): + UpdatableComponent(other), + params_(other.params_), + use_dropout_(other.use_dropout_), + value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_config_(other.self_repair_config_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + preconditioner_(other.preconditioner_) { } + +void LstmNonlinearityComponent::Init( + int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale) { + KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 && + tanh_self_repair_threshold >= 0.0 && + tanh_self_repair_threshold <= 1.0 && + sigmoid_self_repair_threshold >= 0.0 && + sigmoid_self_repair_threshold <= 0.25 && + self_repair_scale >= 0.0 && self_repair_scale <= 0.1); + use_dropout_ = use_dropout; + params_.Resize(3, cell_dim); + params_.SetRandn(); + params_.Scale(param_stddev); + value_sum_.Resize(5, cell_dim); + deriv_sum_.Resize(5, cell_dim); + self_repair_config_.Resize(10); + self_repair_config_.Range(0, 5).Set(sigmoid_self_repair_threshold); + self_repair_config_(2) = tanh_self_repair_threshold; + self_repair_config_(4) = tanh_self_repair_threshold; + self_repair_config_.Range(5, 5).Set(self_repair_scale); + self_repair_total_.Resize(5); + count_ = 0.0; + InitNaturalGradient(); + +} + +void LstmNonlinearityComponent::InitNaturalGradient() { + // As regards the configuration for the natural-gradient preconditioner, we + // don't make it configurable from the command line-- it's unlikely that any + // differences from changing this would be substantial enough to effectively + // tune the configuration. Because the preconditioning code doesn't 'see' the + // derivatives from individual frames, but only averages over the minibatch, + // there is a fairly small amount of data available to estimate the Fisher + // information matrix, so we set the rank, update period and + // num-samples-history to smaller values than normal. 
+ preconditioner_.SetRank(20); + preconditioner_.SetUpdatePeriod(2); + preconditioner_.SetNumSamplesHistory(1000.0); +} + +/// virtual +void LstmNonlinearityComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_.Freeze(freeze); +} + +void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + InitLearningRatesFromConfig(cfl); + bool ok = true; + bool use_dropout = false; + int32 cell_dim; + // these self-repair thresholds are the normal defaults for tanh and sigmoid + // respectively. If, later on, we decide that we want to support different + // self-repair config values for the individual sigmoid and tanh + // nonlinearities, we can modify this code then. + BaseFloat tanh_self_repair_threshold = 0.2, + sigmoid_self_repair_threshold = 0.05, + self_repair_scale = 1.0e-05; + // param_stddev is the stddev of the parameters. it may be better to + // use a smaller value but this was the default in the python scripts + // for a while. + BaseFloat param_stddev = 1.0; + ok = ok && cfl->GetValue("cell-dim", &cell_dim); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("tanh-self-repair-threshold", + &tanh_self_repair_threshold); + cfl->GetValue("sigmoid-self-repair-threshold", + &sigmoid_self_repair_threshold); + cfl->GetValue("self-repair-scale", &self_repair_scale); + cfl->GetValue("use-dropout", &use_dropout); + + // We may later on want to make it possible to initialize the different + // parameters w_ic, w_fc and w_oc with different biases. We'll implement + // that when and if it's needed. + + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (ok) { + Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, + sigmoid_self_repair_threshold, self_repair_scale); + } else { + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + } +} + +void LstmNonlinearityComponent::ConsolidateMemory() { + OnlineNaturalGradient preconditioner_temp(preconditioner_); + preconditioner_.Swap(&preconditioner_); +} + + +int32 GruNonlinearityComponent::InputDim() const { + if (recurrent_dim_ == cell_dim_) { + // non-projected GRU. + return 4 * cell_dim_; + } else { + return 3 * cell_dim_ + 2 * recurrent_dim_; + } +} + +int32 GruNonlinearityComponent::OutputDim() const { + return 2 * cell_dim_; +} + + +std::string GruNonlinearityComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", cell-dim=" << cell_dim_ + << ", recurrent-dim=" << recurrent_dim_; + PrintParameterStats(stream, "w_h", w_h_); + stream << ", self-repair-threshold=" << self_repair_threshold_ + << ", self-repair-scale=" << self_repair_scale_; + if (count_ > 0) { // c.f. NonlinearComponent::Info(). + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + stream << ", self-repaired-proportion=" + << (self_repair_total_ / (count_ * cell_dim_)); + Vector value_avg_dbl(value_sum_); + Vector value_avg(value_avg_dbl); + value_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg); + Vector deriv_avg_dbl(deriv_sum_); + Vector deriv_avg(deriv_avg_dbl); + deriv_avg.Scale(1.0 / count_); + stream << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + // natural-gradient parameters. 
+ stream << ", alpha=" << preconditioner_in_.GetAlpha() + << ", rank-in=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", update-period=" + << preconditioner_in_.GetUpdatePeriod(); + return stream.str(); +} + +void GruNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + cell_dim_ = -1; + recurrent_dim_ = -1; + self_repair_threshold_ = 0.2; + self_repair_scale_ = 1.0e-05; + + InitLearningRatesFromConfig(cfl); + if (!cfl->GetValue("cell-dim", &cell_dim_) || cell_dim_ <= 0) + KALDI_ERR << "cell-dim > 0 is required for GruNonlinearityComponent."; + + BaseFloat param_stddev = 1.0 / std::sqrt(cell_dim_), + alpha = 4.0; + int32 rank_in = 20, rank_out = 80, + update_period = 4; + + cfl->GetValue("recurrent-dim", &recurrent_dim_); + cfl->GetValue("self-repair-threshold", &self_repair_threshold_); + cfl->GetValue("self-repair-scale", &self_repair_scale_); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("update-period", &update_period); + + if (recurrent_dim_ < 0) + recurrent_dim_ = cell_dim_; + if (recurrent_dim_ == 0 || recurrent_dim_ > cell_dim_) + KALDI_ERR << "Invalid values for cell-dim and recurrent-dim"; + + w_h_.Resize(cell_dim_, recurrent_dim_); + w_h_.SetRandn(); + w_h_.Scale(param_stddev); + + preconditioner_in_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_in_.SetUpdatePeriod(update_period); + preconditioner_out_.SetAlpha(alpha); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + + count_ = 0.0; + self_repair_total_ = 0.0; + value_sum_.Resize(cell_dim_); + deriv_sum_.Resize(cell_dim_); + + Check(); +} + +void* GruNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumRows() == out->NumRows() && + in.NumCols() == InputDim() && + out->NumCols() == OutputDim()); + // If recurrent_dim_ != cell_dim_, this is projected GRU and we + // are computing: + // (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t). + // Otherwise (no projection), it's + // (z_t, r_t, hpart_t, y_{t-1},) -> (h_t, y_t). + // but to understand this code, it's better to rename y to c: + // (z_t, r_t, hpart_t, c_{t-1}) -> (h_t, c_t). + int32 num_rows = in.NumRows(), + c = cell_dim_, + r = recurrent_dim_; + CuSubMatrix z_t(in, 0, num_rows, 0, c), + r_t(in, 0, num_rows, c, r), + hpart_t(in, 0, num_rows, c + r, c), + c_t1(in, 0, num_rows, c + r + c, c); + // note: the variable named 'c_t1' actually represents + // y_{t-1} for non-projected GRUs. + + // By setting s_t1 to the last recurrent_dim_ rows of 'in', we get something + // that represents s_{t-1} for recurrent setups and y_{t-1} (which we're + // renaming to c_{t-1}) for non-projected GRUs. The key thing is that + // in the non-projected case, the variables c_t1 and s_t1 point to the + // same memory. + CuSubMatrix s_t1(in, 0, num_rows, in.NumCols() - r, r); + + // note: for non-projected GRUs, c_t below is actually y_t. + CuSubMatrix h_t(*out, 0, num_rows, 0, c), + c_t(*out, 0, num_rows, c, c); + + // sdotr is the only temporary storage we need in the forward pass. + CuMatrix sdotr(num_rows, r); + sdotr.AddMatMatElements(1.0, r_t, s_t1, 0.0); + // now sdotr = r_t \dot s_{t-1}. + h_t.CopyFromMat(hpart_t); + // now h_t = hpart_t (note: hpart_t actually means U^h x_t). 
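+ // Note on dimensions in the AddMatMat call below: sdotr is num_rows x
+ // recurrent_dim and w_h_ is cell_dim x recurrent_dim (see InitFromConfig),
+ // so sdotr * w_h_^T is num_rows x cell_dim, matching h_t.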
+ h_t.AddMatMat(1.0, sdotr, kNoTrans, w_h_, kTrans, 1.0); + // now h_t = hpart_t + W^h (s_{t-1} \dot r_t). + h_t.Tanh(h_t); + // now, h_t = tanh(hpart_t + W^h (s_{t-1} \dot r_t)). + + c_t.CopyFromMat(h_t); + // now c_t = h_t + c_t.AddMatMatElements(-1.0, z_t, h_t, 1.0); + // now c_t = (1 - z_t) \dot h_t. + c_t.AddMatMatElements(1.0, z_t, c_t1, 1.0); + // now c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + return NULL; +} + +void GruNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(SameDim(out_value, out_deriv) && + in_value.NumRows() == out_value.NumRows() && + in_value.NumCols() == InputDim() && + out_value.NumCols() == OutputDim() && + (in_deriv == NULL || SameDim(in_value, *in_deriv)) && + memo == NULL); + GruNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(in_deriv != NULL || to_update != NULL); + int32 num_rows = in_value.NumRows(), + c = cell_dim_, + r = recurrent_dim_; + + // To understand what's going on here, compare this code with the + // corresponding 'forward' code in Propagate(). + + + CuSubMatrix z_t(in_value, 0, num_rows, 0, c), + r_t(in_value, 0, num_rows, c, r), + hpart_t(in_value, 0, num_rows, c + r, c), + c_t1(in_value, 0, num_rows, c + r + c, c), + s_t1(in_value, 0, num_rows, in_value.NumCols() - r, r); + + + // The purpose of this 'in_deriv_ptr' is so that we can create submatrices + // like z_t_deriv without the code crashing. If in_deriv is NULL these point + // to 'in_value', and we'll be careful never to actually write to these + // sub-matrices, which aside from being conceptually wrong would violate the + // const semantics of this function. + const CuMatrixBase *in_deriv_ptr = + (in_deriv == NULL ? &in_value : in_deriv); + CuSubMatrix z_t_deriv(*in_deriv_ptr, 0, num_rows, 0, c), + r_t_deriv(*in_deriv_ptr, 0, num_rows, c, r), + hpart_t_deriv(*in_deriv_ptr, 0, num_rows, c + r, c), + c_t1_deriv(*in_deriv_ptr, 0, num_rows, c + r + c, c), + s_t1_deriv(*in_deriv_ptr, 0, num_rows, in_value.NumCols() - r, r); + + // Note: the output h_t is never actually used in the GRU computation (we only + // output it because we want the value to be cached to save computation in the + // backprop), so we expect that the 'h_t_deriv', if we extracted it in the + // obvious way, would be all zeros. + // We create a different, local h_t_deriv + // variable that backpropagates the derivative from c_t_deriv. + CuSubMatrix h_t(out_value, 0, num_rows, 0, c), + c_t(out_value, 0, num_rows, c, c), + c_t_deriv(out_deriv, 0, num_rows, c, c); + CuMatrix h_t_deriv(num_rows, c, kUndefined); + + { // we initialize h_t_deriv with the derivative from 'out_deriv'. + // In real life in a GRU, this would always be zero; but in testing + // code it may be nonzero and we include this term so that + // the tests don't fail. Note: if you were to remove these + // lines, you'd have to change 'h_t_deriv.AddMat(1.0, c_t_deriv);' below + // to a CopyFromMat() call. + CuSubMatrix h_t_deriv_in(out_deriv, 0, num_rows, 0, c); + h_t_deriv.CopyFromMat(h_t_deriv_in); + } + + + // sdotr is the same variable as used in the forward pass, it will contain + // r_t \dot s_{t-1}. 
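+ // As a summary of what the rest of this function computes (it follows from
+ // the forward-pass expressions above, using d(tanh x)/dx = 1 - tanh^2 x):
+ //   h_t_deriv      = c_t_deriv \dot (1 - z_t)   [plus any deriv w.r.t. h_t itself]
+ //   z_t_deriv     += c_t_deriv \dot (c_{t-1} - h_t)
+ //   c_t1_deriv    += c_t_deriv \dot z_t
+ //   a_t_deriv      = h_t_deriv \dot (1 - h_t^2),
+ //                    where a_t = hpart_t + W^h (s_{t-1} \dot r_t)
+ //   hpart_t_deriv += a_t_deriv
+ //   r_t_deriv     += (a_t_deriv W^h) \dot s_{t-1}
+ //   s_t1_deriv    += (a_t_deriv W^h) \dot r_t
+ //   W^h gradient   = a_t_deriv^T (s_{t-1} \dot r_t)
+ // (plus the self-repair term added in TanhStatsAndSelfRepair()).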
+ CuMatrix sdotr(num_rows, r); + sdotr.AddMatMatElements(1.0, r_t, s_t1, 0.0); + + + { // This block does the + // backprop corresponding to the + // forward-pass expression: c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + + // First do: h_t_deriv = c_t_deriv \dot (1 - z_t). + h_t_deriv.AddMat(1.0, c_t_deriv); + h_t_deriv.AddMatMatElements(-1.0, c_t_deriv, z_t, 1.0); + + if (in_deriv) { + // these should be self-explanatory if you study + // the expression "c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}". + z_t_deriv.AddMatMatElements(-1.0, c_t_deriv, h_t, 1.0); + z_t_deriv.AddMatMatElements(1.0, c_t_deriv, c_t1, 1.0); + c_t1_deriv.AddMatMatElements(1.0, c_t_deriv, z_t, 1.0); + } + } + + h_t_deriv.DiffTanh(h_t, h_t_deriv); + if (to_update) + to_update->TanhStatsAndSelfRepair(h_t, &h_t_deriv); + + + if (to_update) + to_update->UpdateParameters(sdotr, h_t_deriv); + + // At this point, 'h_t_deriv' contains the derivative w.r.t. + // the argument of the tanh function, i.e. w.r.t. the expression: + // hpart_t + W^h (s_{t-1} \dot r_t). + // The next block propagates this to the derivatives for + // hpart_t, s_{t-1} and r_t. + if (in_deriv) { + hpart_t_deriv.AddMat(1.0, h_t_deriv); + + // We re-use the memory that we used for s_{t-1} \dot r_t, + // for its derivative. + CuMatrix &sdotr_deriv(sdotr); + sdotr_deriv.AddMatMat(1.0, h_t_deriv, kNoTrans, w_h_, kNoTrans, 0.0); + + // we add to all the input-derivatives instead of setting them, + // because we chose to export the flag kBackpropAdds. + r_t_deriv.AddMatMatElements(1.0, sdotr_deriv, s_t1, 1.0); + s_t1_deriv.AddMatMatElements(1.0, sdotr_deriv, r_t, 1.0); + } +} + + +void GruNonlinearityComponent::TanhStatsAndSelfRepair( + const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv) { + KALDI_ASSERT(SameDim(h_t, *h_t_deriv)); + + // we use this probability (hardcoded for now) to limit the stats accumulation + // and self-repair code to running on about half of the minibatches. + BaseFloat repair_and_stats_probability = 0.5; + if (RandUniform() > repair_and_stats_probability) + return; + + // OK, accumulate stats. + // For the next few lines, compare with TanhComponent::StoreStats(), which is where + // we got this code. + // tanh_deriv is the function derivative of the tanh function, + // tanh'(x) = tanh(x) * (1.0 - tanh(x)). h_t corresponds to tanh(x). + CuMatrix tanh_deriv(h_t); + tanh_deriv.ApplyPow(2.0); + tanh_deriv.Scale(-1.0); + tanh_deriv.Add(1.0); + + count_ += h_t.NumRows(); + CuVector temp(cell_dim_); + temp.AddRowSumMat(1.0, h_t, 0.0); + value_sum_.AddVec(1.0, temp); + temp.AddRowSumMat(1.0, tanh_deriv, 0.0); + deriv_sum_.AddVec(1.0, temp); + + if (count_ <= 0.0) { + // this would be rather pathological if it happened. + return; + } + + // The rest of this function contains code modified from + // TanhComponent::RepairGradients(). + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, cell_dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(self_repair_threshold_ * count_); + thresholds.ApplyHeaviside(); + self_repair_total_ += thresholds_vec.Sum(); + + // there is a comment explaining what we are doing with + // 'thresholds_vec', at this point in TanhComponent::RepairGradients(). + // We won't repeat it here. 
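+ // In short: thresholds_vec(i) is now 1.0 for any dimension i whose average
+ // tanh-derivative, deriv_sum_(i) / count_, has dropped below
+ // self_repair_threshold_, and 0.0 otherwise.  The call below adds
+ // -self_repair_scale_ / repair_and_stats_probability times h_t to the
+ // gradient for just those dimensions (the division compensates for only
+ // doing this on about half of the minibatches), which nudges saturated
+ // tanh units back toward the linear range.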
+ + h_t_deriv->AddMatDiagVec(-self_repair_scale_ / repair_and_stats_probability, + h_t, kNoTrans, thresholds_vec); +} + +void GruNonlinearityComponent::UpdateParameters( + const CuMatrixBase &sdotr, + const CuMatrixBase &h_t_deriv) { + if (is_gradient_) { + // 'simple' update, no natural gradient. Compare + // with AffineComponent::UpdateSimple(). + w_h_.AddMatMat(learning_rate_, h_t_deriv, kTrans, + sdotr, kNoTrans, 1.0); + } else { + // the natural-gradient update. + CuMatrix in_value_temp(sdotr), + out_deriv_temp(h_t_deriv); + + // These "scale" values get will get multiplied into the learning rate. + BaseFloat in_scale, out_scale; + + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); + + BaseFloat local_lrate = learning_rate_ * in_scale * out_scale; + w_h_.AddMatMat(local_lrate, out_deriv_temp, kTrans, + in_value_temp, kNoTrans, 1.0); + } +} + + + +void GruNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &cell_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &recurrent_dim_); + ExpectToken(is, binary, ""); + w_h_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_total_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + value_sum_.Scale(count_); // we read in the averages, not the sums. + deriv_sum_.Scale(count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_scale_); + BaseFloat alpha; + int32 rank_in, rank_out, update_period; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &alpha); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &rank_in); + ReadBasicType(is, binary, &rank_out); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &update_period); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetUpdatePeriod(update_period); + preconditioner_out_.SetUpdatePeriod(update_period); + ExpectToken(is, binary, ""); +} + +void GruNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, cell_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, recurrent_dim_); + WriteToken(os, binary, ""); + w_h_.Write(os, binary); + { + // Write the value and derivative stats in a count-normalized way, for + // greater readability in text form. 
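+ // (The stats are held internally as sums; the division by count_ here is
+ // undone in Read(), which scales the averages back up by count_.)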
+ WriteToken(os, binary, ""); + Vector temp(value_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + temp.CopyFromVec(deriv_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_total_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_scale_); + + BaseFloat alpha = preconditioner_in_.GetAlpha(); + int32 rank_in = preconditioner_in_.GetRank(), + rank_out = preconditioner_out_.GetRank(), + update_period = preconditioner_in_.GetUpdatePeriod(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, alpha); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rank_in); + WriteBasicType(os, binary, rank_out); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, update_period); + WriteToken(os, binary, ""); +} + +void GruNonlinearityComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + w_h_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; + } else { + w_h_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_ *= scale; + count_ *= scale; + } +} + +void GruNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const GruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + w_h_.AddMat(alpha, other->w_h_); + value_sum_.AddVec(alpha, other->value_sum_); + deriv_sum_.AddVec(alpha, other->deriv_sum_); + self_repair_total_ += alpha * other->self_repair_total_; + count_ += alpha * other->count_; +} + +void GruNonlinearityComponent::ZeroStats() { + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; +} + +void GruNonlinearityComponent::Check() const { + KALDI_ASSERT(cell_dim_ > 0 && recurrent_dim_ > 0 && + recurrent_dim_ <= cell_dim_ && + self_repair_threshold_ >= 0.0 && + self_repair_scale_ >= 0.0 ); + KALDI_ASSERT(w_h_.NumRows() == cell_dim_ && + w_h_.NumCols() == recurrent_dim_); + KALDI_ASSERT(value_sum_.Dim() == cell_dim_ && + deriv_sum_.Dim() == cell_dim_); +} + +void GruNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_params(w_h_.NumRows(), w_h_.NumCols()); + temp_params.SetRandn(); + w_h_.AddMat(stddev, temp_params); +} + +BaseFloat GruNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const GruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return TraceMatMat(w_h_, other->w_h_, kTrans); +} + +int32 GruNonlinearityComponent::NumParameters() const { + return w_h_.NumRows() * w_h_.NumCols(); +} + +void GruNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyRowsFromMat(w_h_); +} + + +void GruNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + w_h_.CopyRowsFromVec(params); +} + +void GruNonlinearityComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_in_.Freeze(freeze); + preconditioner_out_.Freeze(freeze); +} + +GruNonlinearityComponent::GruNonlinearityComponent( + const GruNonlinearityComponent &other): + UpdatableComponent(other), + cell_dim_(other.cell_dim_), + recurrent_dim_(other.recurrent_dim_), + w_h_(other.w_h_), + 
value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + self_repair_threshold_(other.self_repair_threshold_), + self_repair_scale_(other.self_repair_scale_), + preconditioner_in_(other.preconditioner_in_), + preconditioner_out_(other.preconditioner_out_) { + Check(); +} + + +int32 OutputGruNonlinearityComponent::InputDim() const { + return 3 * cell_dim_; +} + +int32 OutputGruNonlinearityComponent::OutputDim() const { + return 2 * cell_dim_; +} + + +std::string OutputGruNonlinearityComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", cell-dim=" << cell_dim_; + PrintParameterStats(stream, "w_h", w_h_); + stream << ", self-repair-threshold=" << self_repair_threshold_ + << ", self-repair-scale=" << self_repair_scale_; + if (count_ > 0) { // c.f. NonlinearComponent::Info(). + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + stream << ", self-repaired-proportion=" + << (self_repair_total_ / (count_ * cell_dim_)); + Vector value_avg_dbl(value_sum_); + Vector value_avg(value_avg_dbl); + value_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg); + Vector deriv_avg_dbl(deriv_sum_); + Vector deriv_avg(deriv_avg_dbl); + deriv_avg.Scale(1.0 / count_); + stream << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + // natural-gradient parameters. + stream << ", alpha=" << preconditioner_.GetAlpha() + << ", rank=" << preconditioner_.GetRank() + << ", update-period=" + << preconditioner_.GetUpdatePeriod(); + return stream.str(); +} + +void OutputGruNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + cell_dim_ = -1; + self_repair_threshold_ = 0.2; + self_repair_scale_ = 1.0e-05; + + InitLearningRatesFromConfig(cfl); + if (!cfl->GetValue("cell-dim", &cell_dim_) || cell_dim_ <= 0) + KALDI_ERR << "cell-dim > 0 is required for GruNonlinearityComponent."; + + BaseFloat param_mean = 0.0, param_stddev = 1.0, + alpha = 4.0; + int32 rank=8, + update_period = 10; + + cfl->GetValue("self-repair-threshold", &self_repair_threshold_); + cfl->GetValue("self-repair-scale", &self_repair_scale_); + cfl->GetValue("param-mean", ¶m_mean); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank", &rank); + cfl->GetValue("update-period", &update_period); + + + w_h_.Resize(cell_dim_); + w_h_.SetRandn(); + w_h_.Scale(param_stddev); + w_h_.Add(param_mean); + + preconditioner_.SetAlpha(alpha); + preconditioner_.SetRank(rank); + preconditioner_.SetUpdatePeriod(update_period); + + count_ = 0.0; + self_repair_total_ = 0.0; + value_sum_.Resize(cell_dim_); + deriv_sum_.Resize(cell_dim_); + + Check(); +} + +void* OutputGruNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumRows() == out->NumRows() && + in.NumCols() == InputDim() && + out->NumCols() == OutputDim()); + // This component implements the function + // (z_t, hpart_t, c_{t-1}) -> (h_t, c_t) + // of dimensions + // (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim), + // where: + // h_t = \tanh( hpart_t + W^h \dot c_{t-1} ) + // c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. 
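+ // Unlike GruNonlinearityComponent, here W^h is the per-element vector w_h_
+ // of dimension cell_dim, so "W^h \dot c_{t-1}" is an elementwise product
+ // (implemented with MulColsVec below).  The input columns are laid out as
+ // (z_t, hpart_t, c_{t-1}), each of width cell_dim, and the output columns
+ // as (h_t, c_t).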
+ int32 num_rows = in.NumRows(), + c = cell_dim_; + CuSubMatrix z_t(in, 0, num_rows, 0, c), + hpart_t(in, 0, num_rows, c, c), + c_t1(in, 0, num_rows, c + c, c); + + CuSubMatrix h_t(*out, 0, num_rows, 0, c), + c_t(*out, 0, num_rows, c, c); + + h_t.CopyFromMat(c_t1); + // now h_t = c_{t-1} + h_t.MulColsVec(w_h_); + // now h_t = W^h \dot c_{t-1} + h_t.AddMat(1.0, hpart_t, kNoTrans); + // now h_t = hpart_t + W^h \dot c_{t-1}.(note: hpart_t actually means U^h x_t). + h_t.Tanh(h_t); + // now, h_t = tanh(hpart_t + W^h \dot c_{t-1}). + + c_t.CopyFromMat(h_t); + // now c_t = h_t + c_t.AddMatMatElements(-1.0, z_t, h_t, 1.0); + // now c_t = (1 - z_t) \dot h_t. + c_t.AddMatMatElements(1.0, z_t, c_t1, 1.0); + // now c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + return NULL; +} + +void OutputGruNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(SameDim(out_value, out_deriv) && + in_value.NumRows() == out_value.NumRows() && + in_value.NumCols() == InputDim() && + out_value.NumCols() == OutputDim() && + (in_deriv == NULL || SameDim(in_value, *in_deriv)) && + memo == NULL); + OutputGruNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(in_deriv != NULL || to_update != NULL); + int32 num_rows = in_value.NumRows(), + c = cell_dim_; + + // To understand what's going on here, compare this code with the + // corresponding 'forward' code in Propagate(). + + + CuSubMatrix z_t(in_value, 0, num_rows, 0, c), + hpart_t(in_value, 0, num_rows, c, c), + c_t1(in_value, 0, num_rows, c + c, c); + + // The purpose of this 'in_deriv_ptr' is so that we can create submatrices + // like z_t_deriv without the code crashing. If in_deriv is NULL these point + // to 'in_value', and we'll be careful never to actually write to these + // sub-matrices, which aside from being conceptually wrong would violate the + // const semantics of this function. + const CuMatrixBase *in_deriv_ptr = + (in_deriv == NULL ? &in_value : in_deriv); + CuSubMatrix z_t_deriv(*in_deriv_ptr, 0, num_rows, 0, c), + hpart_t_deriv(*in_deriv_ptr, 0, num_rows, c, c), + c_t1_deriv(*in_deriv_ptr, 0, num_rows, c + c, c); + + // Note: the output h_t is never actually used in the GRU computation (we only + // output it because we want the value to be cached to save computation in the + // backprop), so we expect that the 'h_t_deriv', if we extracted it in the + // obvious way, would be all zeros. + // We create a different, local h_t_deriv + // variable that backpropagates the derivative from c_t_deriv. + CuSubMatrix h_t(out_value, 0, num_rows, 0, c), + c_t(out_value, 0, num_rows, c, c), + c_t_deriv(out_deriv, 0, num_rows, c, c); + CuMatrix h_t_deriv(num_rows, c, kUndefined); + + { // we initialize h_t_deriv with the derivative from 'out_deriv'. + // In real life in a GRU, this would always be zero; but in testing + // code it may be nonzero and we include this term so that + // the tests don't fail. Note: if you were to remove these + // lines, you'd have to change 'h_t_deriv.AddMat(1.0, c_t_deriv);' below + // to a CopyFromMat() call. + CuSubMatrix h_t_deriv_in(out_deriv, 0, num_rows, 0, c); + h_t_deriv.CopyFromMat(h_t_deriv_in); + } + + + { // This block does the + // backprop corresponding to the + // forward-pass expression: c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. 
+ + // First do: h_t_deriv = c_t_deriv \dot (1 - z_t). + h_t_deriv.AddMat(1.0, c_t_deriv); + h_t_deriv.AddMatMatElements(-1.0, c_t_deriv, z_t, 1.0); + + if (in_deriv) { + // these should be self-explanatory if you study + // the expression "c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}". + z_t_deriv.AddMatMatElements(-1.0, c_t_deriv, h_t, 1.0); + z_t_deriv.AddMatMatElements(1.0, c_t_deriv, c_t1, 1.0); + c_t1_deriv.AddMatMatElements(1.0, c_t_deriv, z_t, 1.0); + } + } + + h_t_deriv.DiffTanh(h_t, h_t_deriv); + if (to_update) + to_update->TanhStatsAndSelfRepair(h_t, &h_t_deriv); + + if (to_update) + to_update->UpdateParameters(c_t1, h_t_deriv); + // At this point, 'h_t_deriv' contains the derivative w.r.t. + // the argument of the tanh function, i.e. w.r.t. the expression: + // hpart_t + W^h \dot c_{t-1}. + // The next block propagates this to the derivative for h_part_t and c_t1 + // The derivative of z_t has already been finished. + if (in_deriv) { + hpart_t_deriv.AddMat(1.0, h_t_deriv); + + // Currently, c_t1_deriv contains the derivative from + // c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1} + // Now compute the h_t = \tanh(hpart_t + W^h \dot c_{t-1}) part + h_t_deriv.MulColsVec(w_h_); + // Combine the two parts + c_t1_deriv.AddMat(1.0, h_t_deriv); + } +} + + +void OutputGruNonlinearityComponent::TanhStatsAndSelfRepair( + const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv) { + KALDI_ASSERT(SameDim(h_t, *h_t_deriv)); + + // we use this probability (hardcoded for now) to limit the stats accumulation + // and self-repair code to running on about half of the minibatches. + BaseFloat repair_and_stats_probability = 0.5; + if (RandUniform() > repair_and_stats_probability) + return; + + // OK, accumulate stats. + // For the next few lines, compare with TanhComponent::StoreStats(), which is where + // we got this code. + // tanh_deriv is the function derivative of the tanh function, + // tanh'(x) = tanh(x) * (1.0 - tanh(x)). h_t corresponds to tanh(x). + CuMatrix tanh_deriv(h_t); + tanh_deriv.ApplyPow(2.0); + tanh_deriv.Scale(-1.0); + tanh_deriv.Add(1.0); + + count_ += h_t.NumRows(); + CuVector temp(cell_dim_); + temp.AddRowSumMat(1.0, h_t, 0.0); + value_sum_.AddVec(1.0, temp); + temp.AddRowSumMat(1.0, tanh_deriv, 0.0); + deriv_sum_.AddVec(1.0, temp); + + if (count_ <= 0.0) { + // this would be rather pathological if it happened. + return; + } + + // The rest of this function contains code modified from + // TanhComponent::RepairGradients(). + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, cell_dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(self_repair_threshold_ * count_); + thresholds.ApplyHeaviside(); + self_repair_total_ += thresholds_vec.Sum(); + + // there is a comment explaining what we are doing with + // 'thresholds_vec', at this point in TanhComponent::RepairGradients(). + // We won't repeat it here. + + h_t_deriv->AddMatDiagVec(-self_repair_scale_ / repair_and_stats_probability, + h_t, kNoTrans, thresholds_vec); +} + +void OutputGruNonlinearityComponent::UpdateParameters( + const CuMatrixBase &c_t1_value, + const CuMatrixBase &h_t_deriv) { + if (is_gradient_) { + // 'simple' update, no natural gradient. Compare + // with PerElementScaleComponent::UpdateSimple(). + w_h_.AddDiagMatMat(learning_rate_, h_t_deriv, kTrans, + c_t1_value, kNoTrans, 1.0); + } else { + // the natural-gradient update. 
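+ // Since the pre-tanh activation is hpart_t + w_h_ \dot c_{t-1}, the gradient
+ // w.r.t. w_h_ is the sum over frames of h_t_deriv \dot c_{t-1} (h_t_deriv
+ // here is already the derivative w.r.t. that pre-tanh activation).  Below,
+ // 'derivs_per_frame' holds those per-frame products; the preconditioner
+ // smooths them, and their row-sum, times the learning rate and the returned
+ // scale, becomes the update to w_h_.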
+ CuMatrix derivs_per_frame(c_t1_value); + derivs_per_frame.MulElements(h_t_deriv); + + // This "scale" value gets will get multiplied into the learning rate. + BaseFloat scale; + + preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); + + CuVector delta_w_h(w_h_.Dim()); + delta_w_h.AddRowSumMat(scale * learning_rate_, derivs_per_frame); + w_h_.AddVec(1.0, delta_w_h); + } +} + + + +void OutputGruNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &cell_dim_); + ExpectToken(is, binary, ""); + w_h_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_total_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + value_sum_.Scale(count_); // we read in the averages, not the sums. + deriv_sum_.Scale(count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &self_repair_scale_); + BaseFloat alpha; + int32 rank, update_period; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &alpha); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &rank); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &update_period); + preconditioner_.SetRank(rank); + preconditioner_.SetAlpha(alpha); + preconditioner_.SetUpdatePeriod(update_period); + ExpectToken(is, binary, ""); +} + +void OutputGruNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, cell_dim_); + WriteToken(os, binary, ""); + w_h_.Write(os, binary); + { + // Write the value and derivative stats in a count-normalized way, for + // greater readability in text form. 
+ WriteToken(os, binary, ""); + Vector temp(value_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + temp.CopyFromVec(deriv_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_total_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_scale_); + + BaseFloat alpha = preconditioner_.GetAlpha(); + int32 rank = preconditioner_.GetRank(), + update_period = preconditioner_.GetUpdatePeriod(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, alpha); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rank); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, update_period); + WriteToken(os, binary, ""); +} + +void OutputGruNonlinearityComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + w_h_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; + } else { + w_h_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_ *= scale; + count_ *= scale; + } +} + +void OutputGruNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const OutputGruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + w_h_.AddVec(alpha, other->w_h_); + value_sum_.AddVec(alpha, other->value_sum_); + deriv_sum_.AddVec(alpha, other->deriv_sum_); + self_repair_total_ += alpha * other->self_repair_total_; + count_ += alpha * other->count_; +} + +void OutputGruNonlinearityComponent::ZeroStats() { + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_ = 0.0; + count_ = 0.0; +} + +void OutputGruNonlinearityComponent::Check() const { + KALDI_ASSERT(cell_dim_ > 0 && + self_repair_threshold_ >= 0.0 && + self_repair_scale_ >= 0.0 ); + KALDI_ASSERT(w_h_.Dim() == cell_dim_); + KALDI_ASSERT(value_sum_.Dim() == cell_dim_ && + deriv_sum_.Dim() == cell_dim_); +} + +void OutputGruNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuVector temp_params(w_h_.Dim()); + temp_params.SetRandn(); + w_h_.AddVec(stddev, temp_params); +} + +BaseFloat OutputGruNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const OutputGruNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return VecVec(w_h_, other->w_h_); +} + +int32 OutputGruNonlinearityComponent::NumParameters() const { + return w_h_.Dim(); +} + +void OutputGruNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyFromVec(w_h_); +} + + +void OutputGruNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + w_h_.CopyFromVec(params); +} + +void OutputGruNonlinearityComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_.Freeze(freeze); +} + +OutputGruNonlinearityComponent::OutputGruNonlinearityComponent( + const OutputGruNonlinearityComponent &other): + UpdatableComponent(other), + cell_dim_(other.cell_dim_), + w_h_(other.w_h_), + value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + self_repair_threshold_(other.self_repair_threshold_), + self_repair_scale_(other.self_repair_scale_), + 
preconditioner_(other.preconditioner_) { + Check(); +} + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-combined-component.h b/src/nnet3/nnet-combined-component.h new file mode 100644 index 00000000000..85011bd826d --- /dev/null +++ b/src/nnet3/nnet-combined-component.h @@ -0,0 +1,1109 @@ +// nnet3/nnet-combined-component.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_SPECIAL_COMPONENT_H_ +#define KALDI_NNET3_NNET_SPECIAL_COMPONENT_H_ + +#include "nnet3/nnet-common.h" +#include "nnet3/nnet-component-itf.h" +#include "nnet3/natural-gradient-online.h" +#include + +namespace kaldi { +namespace nnet3 { + +/// @file nnet-combined-component.h +/// You can view this as an overflow from nnet-simple-component.h. +/// It contains components which meet the definition of "simple" +/// components, i.e. they set the kSimpleComponent flag, but +/// which are more special-purpose, i.e. they are specific to +/// special layer types such as LSTMs, CNNs and GRUs. + + + +/** + * WARNING, this component is deprecated in favor of + * TimeHeightConvolutionComponent, and will be deleted. + * ConvolutionalComponent implements 2d-convolution. + * It uses 3D filters on 3D inputs, but the 3D filters hop only over + * 2 dimensions as it has same size as the input along the 3rd dimension. + * Input : A matrix where each row is a vectorized 3D-tensor. + * The 3D tensor has dimensions + * x: (e.g. time) + * y: (e.g. frequency) + * z: (e.g. channels like features/delta/delta-delta) + * + * The component supports input vectorizations of type zyx and yzx. + * The default vectorization type is zyx. + * e.g. for input vectorization of type zyx the input is vectorized by + * spanning axes z, y and x of the tensor in that order. + * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions + * the zyx vectorized input looks like + * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1) + * + * + * Output : The output is also a 3D tensor vectorized in the zyx format. + * The channel axis (z) in the output corresponds to the output of + * different filters. The first channel corresponds to the first filter + * i.e., first row of the filter_params_ matrix. + * + * Note: The component has to support yzx input vectorization as the binaries + * like add-deltas generate yz vectorized output. These input vectors are + * concatenated using the Append descriptor across time steps to form a yzx + * vectorized 3D tensor input. + * e.g. Append(Offset(input, -1), input, Offset(input, 1)) + * + * + * For information on the hyperparameters and parameters of this component see + * the variable declarations. 
+ * + * Propagation: + * ------------ + * Convolution operation consists of a dot-products between the filter tensor + * and input tensor patch, for various shifts of filter tensor along the x and y + * axes input tensor. (Note: there is no shift along z-axis as the filter and + * input tensor have same size along this axis). + * + * For a particular shift (i,j) of the filter tensor + * along input tensor dimensions x and y, the elements of the input tensor which + * overlap with the filter form the input tensor patch. This patch is vectorized + * in zyx format. All the patches corresponding to various samples in the + * mini-batch are stacked into a matrix, where each row corresponds to one + * patch. Let this matrix be represented by X_{i,j}. The dot products with + * various filters are computed simultaneously by computing the matrix product + * with the filter_params_ matrix (W) + * Y_{i,j} = X_{i,j}*W^T. + * Each row of W corresponds to one filter 3D tensor vectorized in zyx format. + * + * All the matrix products corresponding to various shifts (i,j) of the + * filter tensor are computed simultaneously using the AddMatMatBatched + * call of CuMatrixBase class. + * + * BackPropagation: + * ---------------- + * Backpropagation to compute the input derivative (\nabla X_{i,j}) + * consists of the a series of matrix products. + * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the + * output derivative for a particular shift of the filter. + * + * Once again these matrix products are computed simultaneously. + * + * Update: + * ------- + * The weight gradient is computed as + * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j}) + * + */ +class ConvolutionComponent: public UpdatableComponent { + public: + enum TensorVectorizationType { + kYzx = 0, + kZyx = 1 + }; + + ConvolutionComponent(); + // constructor using another component + ConvolutionComponent(const ConvolutionComponent &component); + // constructor using parameters + ConvolutionComponent( + const CuMatrixBase &filter_params, + const CuVectorBase &bias_params, + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + BaseFloat learning_rate); + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "ConvolutionComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput| + kBackpropAdds|kPropagateAdds; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + void Update(const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv, + const std::vector *>& out_deriv_batch); + + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions from base-class UpdatableComponent. 
+ virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + + // Some functions that are specific to this class. + void SetParams(const VectorBase &bias, + const MatrixBase &filter); + const CuVector &BiasParams() const { return bias_params_; } + const CuMatrix &LinearParams() const { return filter_params_; } + void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, int32 num_filters, + TensorVectorizationType input_vectorization, + BaseFloat param_stddev, BaseFloat bias_stddev); + // there is no filt_z_dim parameter as the length of the filter along + // z-dimension is same as the input + void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + std::string matrix_filename); + + // resize the component, setting the parameters to zero, while + // leaving any other configuration values the same + void Resize(int32 input_dim, int32 output_dim); + + void Update(const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv); + + + private: + int32 input_x_dim_; // size of the input along x-axis + // (e.g. number of time steps) + + int32 input_y_dim_; // size of input along y-axis + // (e.g. number of mel-frequency bins) + + int32 input_z_dim_; // size of input along z-axis + // (e.g. number of channels is 3 if the input has + // features + delta + delta-delta features + + int32 filt_x_dim_; // size of the filter along x-axis + + int32 filt_y_dim_; // size of the filter along y-axis + + // there is no filt_z_dim_ as it is always assumed to be + // the same as input_z_dim_ + + int32 filt_x_step_; // the number of steps taken along x-axis of input + // before computing the next dot-product + // of filter and input + + int32 filt_y_step_; // the number of steps taken along y-axis of input + // before computing the next dot-product of the filter + // and input + + // there is no filt_z_step_ as only dot product is possible along this axis + + TensorVectorizationType input_vectorization_; // type of vectorization of the + // input 3D tensor. Accepts zyx and yzx formats + + CuMatrix filter_params_; + // the filter (or kernel) matrix is a matrix of vectorized 3D filters + // where each row in the matrix corresponds to one filter. + // The 3D filter tensor is vectorizedin zyx format. + // The first row of the matrix corresponds to the first filter and so on. + // Keep in mind the vectorization type and order of filters when using file + // based initialization. + + CuVector bias_params_; + // the filter-specific bias vector (i.e., there is a seperate bias added + // to the output of each filter). + + void InputToInputPatches(const CuMatrixBase& in, + CuMatrix *patches) const; + void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const; + const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow. +}; + + +/* + LstmNonlinearityComponent is a component that implements part of an LSTM, by + combining together the sigmoids and tanh's, plus some diagonal terms, into + a single block. 
+ We will refer to the LSTM formulation used in
+
+ "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
+ by H. Sak et al,
+ http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
+
+ Suppose the cell dimension is C. Then outside this component, we compute
+ the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
+ matrix multiplication:
+
+ i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
+ f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
+ c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
+ o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
+
+ The part of the computation that takes place in this component is as follows.
+ Its input is of dimension 5C [however, search for 'dropout' below],
+ consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). Its
+ output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
+
+ To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
+
+ This component has parameters, 3C of them in total: the diagonal matrices w_{ic}, w_{fc}
+ and w_{oc}.
+
+
+ In the forward pass (Propagate), this component computes the following:
+
+ i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
+ f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
+ c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
+ o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
+ m_t = o_t * Tanh(c_t) (5)
+ # note: the outputs are just c_t and m_t.
+
+ [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead
+ of 5C; in this case, the last three input dimensions will be interpreted as
+ per-frame dropout masks on i_t, f_t and o_t respectively, so that on the RHS of
+ (3), i_t is replaced by i_t * i_t_scale, and likewise for f_t and o_t.]
+
+ The backprop is as you would think, but for the "self-repair" we need to pass
+ in additional vectors (of the same dim as the parameters of the layer) that
+ dictate whether or not we add an additional term to the backpropagated
+ derivatives. (This term helps force the input to the nonlinearities into the
+ range where the derivatives are not too small).
+
+ This component stores stats of the same form as are normally stored by the
+ StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
+ activations and derivatives, but this is done inside the Backprop() functions.
+ [the StoreStats() functions don't take the input data as an argument, so
+ storing this data that way is impossible, and anyway it's more efficient to
+ do it as part of backprop.]
+
+ Configuration values accepted:
+ cell-dim e.g. cell-dim=1024 Cell dimension. The input
+ dimension of this component is cell-dim * 5, and the
+ output dimension is cell-dim * 2. Note: this
+ component implements only part of the LSTM layer,
+ see comments above.
+ param-stddev Standard deviation for random initialization of
+ the diagonal matrices (AKA peephole connections).
+ default=1.0, which is probably too high but
+ we couldn't see any reliable gain from decreasing it.
+ tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold
+ in a TanhComponent; applies to both the tanh nonlinearities.
+ default=0.2, you probably won't want to change this.
+ sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold
+ in a SigmoidComponent; applies to all three of the sigmoid
+ nonlinearities. default=0.05, you probably won't want to
+ change this.
+ self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent + or TanhComponent; applies to both the sigmoid and tanh + nonlinearities. default=1.0e-05, which you probably won't + want to change unless dealing with an objective function + that has smaller or larger dynamic range than normal, in + which case you might want to make it smaller or larger. +*/ +class LstmNonlinearityComponent: public UpdatableComponent { + public: + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + LstmNonlinearityComponent(): use_dropout_(false) { } + virtual std::string Type() const { return "LstmNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions from base-class UpdatableComponent. + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void ZeroStats(); + virtual void FreezeNaturalGradient(bool freeze); + + // Some functions that are specific to this class: + explicit LstmNonlinearityComponent( + const LstmNonlinearityComponent &other); + + void Init(int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale); + + virtual void ConsolidateMemory(); + + private: + + // Initializes the natural-gradient object with the configuration we + // use for this object, which for now is hardcoded at the C++ level. + void InitNaturalGradient(); + + // Notation: C is the cell dimension; it equals params_.NumCols(). + + // The dimension of the parameter matrix is (3 x C); + // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. + CuMatrix params_; + + // If true, we expect an extra 3 dimensions on the input, for dropout masks + // for i_t and f_t. + bool use_dropout_; + + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in + // equations (1) through (5), this is the sum of the values of the nonliearities + // (used for diagnostics only). It is comparable to value_sum_ vector + // in base-class NonlinearComponent. + CuMatrix value_sum_; + + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in + // equations (1) through (5), this is the sum of the derivatives of the + // nonliearities (used for diagnostics and to control self-repair). It is + // comparable to the deriv_sum_ vector in base-class + // NonlinearComponent. + CuMatrix deriv_sum_; + + // This matrix has dimension 10. 
+ // The contents are a block of 5 self-repair
+ // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
+ // self-repair scales (typically all 0.00001). These are for each of the 5
+ // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
+ // more info).
+ CuVector<BaseFloat> self_repair_config_;
+
+ // This matrix has dimension 5. For each of the 5 nonlinearities in the LSTM
+ // component (see comments in cu-math.h for more info), it contains the total,
+ // over all frames represented in count_, of the number of dimensions that
+ // were subject to self_repair. To get the self-repair proportion you should
+ // divide by (count_ times cell_dim_).
+ CuVector<BaseFloat> self_repair_total_;
+
+ // The total count (number of frames) corresponding to the stats in value_sum_
+ // and deriv_sum_.
+ double count_;
+
+ // Preconditioner for the parameters of this component [operates in the space
+ // of dimension C].
+ // The preconditioner stores its own configuration values; we write and read
+ // these, but not the preconditioner object itself.
+ OnlineNaturalGradient preconditioner_;
+
+ const LstmNonlinearityComponent &operator
+ = (const LstmNonlinearityComponent &other); // Disallow.
+};
+
+
+
+
+/*
+ * WARNING, this component is deprecated as it's not compatible with
+ * TimeHeightConvolutionComponent, and it will eventually be deleted.
+ * MaxPoolingComponent :
+ * The max-pooling component was first used in ConvNets for selecting a
+ * representative activation in an area. It inspired the Maxout nonlinearity.
+ * Each output element of this component is the maximum of a block of
+ * input elements where the block has a 3D dimension (pool_x_size_,
+ * pool_y_size_, pool_z_size_).
+ * Blocks could overlap if the shift value on any axis is smaller
+ * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
+ * If the shift values are equal to their pool sizes, there is no
+ * overlap; whereas if they all equal 1, the blocks overlap to
+ * the greatest possible extent.
+ *
+ * This component is designed to be used after a ConvolutionComponent
+ * so that the input matrix is propagated from a 2d-convolutional layer.
+ * This component implements 3d-maxpooling which performs
+ * max pooling along the three axes.
+ * Input : A matrix where each row is a vectorized 3D-tensor.
+ * The 3D tensor has dimensions
+ * x: (e.g. time)
+ * y: (e.g. frequency)
+ * z: (e.g. channels like number of filters in the ConvolutionComponent)
+ *
+ * The component assumes input vectorizations of type zyx
+ * which is the default output vectorization type of a ConvolutionComponent.
+ * e.g. for input vectorization of type zyx the input is vectorized by
+ * spanning axes z, y and x of the tensor in that order.
+ * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
+ * the zyx vectorized input looks like
+ * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
+ *
+ * Output : The output is also a 3D tensor vectorized in the zyx format.
+ *
+ * For information on the hyperparameters and parameters of this component see
+ * the variable declarations.
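+ * As a small example of the overlap rule above: with pool_x_size_ = 2 and
+ * pool_x_step_ = 1, neighbouring pools along the x-axis share one input
+ * position; with pool_x_step_ = 2 they are disjoint.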
+ * + * + */ +class MaxpoolingComponent: public Component { + public: + + MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), + pool_x_size_(0), pool_y_size_(0), pool_z_size_(0), + pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { } + // constructor using another component + MaxpoolingComponent(const MaxpoolingComponent &component); + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "MaxpoolingComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput| + kBackpropAdds; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new MaxpoolingComponent(*this); } + + + protected: + void InputToInputPatches(const CuMatrixBase& in, + CuMatrix *patches) const; + void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const; + virtual void Check() const; + + + int32 input_x_dim_; // size of the input along x-axis + // (e.g. number of time steps) + int32 input_y_dim_; // size of input along y-axis + // (e.g. number of mel-frequency bins) + int32 input_z_dim_; // size of input along z-axis + // (e.g. number of filters in the ConvolutionComponent) + + int32 pool_x_size_; // size of the pooling window along x-axis + int32 pool_y_size_; // size of the pooling window along y-axis + int32 pool_z_size_; // size of the pooling window along z-axis + + int32 pool_x_step_; // the number of steps taken along x-axis of input + // before computing the next pool + int32 pool_y_step_; // the number of steps taken along y-axis of input + // before computing the next pool + int32 pool_z_step_; // the number of steps taken along z-axis of input + // before computing the next pool + +}; + + +/** + GruNonlinearityComponent is a component that implements part of a + Gated Recurrent Unit (GRU). This is more efficient in time and + memory than stitching it together using more basic components. + For a brief summary of what this actually computes, search + for 'recap' below; the first part of this comment establishes + the context. + + This component supports two cases: the regular GRU + (as described in "Empirical Evaluation of + Gated Recurrent Neural Networks on Sequence Modeling", + https://arxiv.org/pdf/1412.3555.pdf), + and our "projected GRU" which takes ideas from the + paper we'll abbreviate as "LSTM based RNN architectures for LVCSR", + https://arxiv.org/pdf/1402.1128.pdf. + + Before describing what this component does, we'll establish + some notation for the GRU. + + First, the regular (non-projected) GRU. In order to unify the notation with + our "projected GRU", we'll use slightly different variable names. We'll also + ignore the bias terms for purposes of this exposition (let them be implicit). 
+ + + Regular GRU: + + z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate, dim == cell_dim + r_t = \sigmoid ( U^r x_t + W^r y_{t-1} ) # reset gate, dim == cell_dim + h_t = \tanh ( U^h x_t + W^h ( y_{t-1} \dot r_t ) ) # dim == cell_dim + y_t = ( 1 - z_t ) \dot h_t + z_t \dot y_{t-1} # dim == cell_dim + + For the "projected GRU", the 'cell_dim x cell_dim' full-matrix expressions W^z + W^r and W^h that participate in the expressions for z_t, r_t and h_t are + replaced with skinny matrices of dimension 'cell_dim x recurrent_dim' + (where recurrent_dim < cell_dim) and the output is replaced by + a lower-dimension projection of the hidden state, of dimension + 'recurrent_dim + non_recurrent_dim < cell_dim', instead of the + full 'cell_dim'. We rename y_t to c_t (this name is inspired by LSTMs), and + we now let the output (still called y_t) be a projection of c_t. + s_t is a dimension range of the output y_t. Parameters of the + projected GRU: + cell_dim > 0 + recurrent_dim > 0 + non_recurrent_dim > 0 (where non_recurrent_dim + recurrent_dim < cell_dim). + + + Equations: + + z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate, dim(z_t) == cell_dim + r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate, dim(r_t) == recurrent_dim + h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) ) # dim(h_t) == cell_dim + c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # dim(c_t) == cell_dim + y_t = W^y c_t # dim(y_t) = recurrent_dim + non_recurrent_dim. This is + # the output of the GRU. + s_t = y_t[0:recurrent_dim-1] # dimension range of y_t, dim(s_t) = recurrent_dim. + + + Because we'll need it below, we define + hpart_t = U^h x_t + which is a subexpression of h_t. + + Our choice to make a "special" component for the projected GRU is to have + it be a function from + (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t) + That is, the input to the component is all those things on the LHS + appended together, and the output is the two things on the + RHS appended together. The dimensions are: + (cell_dim, recurrent_dim, cell_dim, cell_dim, recurrent_dim) -> (cell_dim, cell_dim). + The component computes the functions: + h_t = \tanh( hpart_t + W^h (s_{t-1} \dot r_t)) + c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + + Notice that 'W^h' is the only parameter that lives inside the component. + + You might also notice that the output 'h_t' is never actually used + in any other part of the GRU, so the question arises: why is it + necessary to have it be an output of the component? This has to do with + saving computation: because h_t is an output, and we'll be defining + the kBackpropNeedsOutput flag, it is available in the backprop phase + and this helps us avoid some computation (otherwise we'd have to do + a redundant multiplication by W^h in the backprop phase that we already + did in the forward phase). We could have used the 'memo' mechanism to + do this, but this is undesirable because the use of a memo disables + 'update consolidation' in the backprop so we'd lose a little + speed there. + + In the case where it's a regular, not projected GRU, this component + is a function from + (z_t, r_t, hpart_t, y_{t-1}) -> (h_t, y_t) + We can actually do this with the same code as the projected-GRU code, + we just make sure that recurrent_dim == cell_dim, and the only structural + difference is that c_{t-1} and s_{t-1} become the same variable (y_{t-1}), + and we rename c_t to y_t. 
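To make the dimension bookkeeping concrete, here is a minimal single-frame sketch of the mapping just described. The function name and the plain std::vector types are for exposition only; the actual component operates on CuMatrix minibatches and additionally handles derivatives, stats and natural-gradient updates.

#include <cassert>
#include <cmath>
#include <vector>

typedef std::vector<float> Vec;
typedef std::vector<Vec> Mat;   // Mat[i][j]: row i, column j

// Single-frame version of (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t).
// w_h has cell_dim rows and recurrent_dim columns.
void GruNonlinearityForward(const Vec &z, const Vec &r, const Vec &hpart,
                            const Vec &c_prev, const Vec &s_prev,
                            const Mat &w_h, Vec *h, Vec *c) {
  int cell_dim = static_cast<int>(z.size()),
      recurrent_dim = static_cast<int>(r.size());
  assert(static_cast<int>(hpart.size()) == cell_dim &&
         static_cast<int>(c_prev.size()) == cell_dim &&
         static_cast<int>(s_prev.size()) == recurrent_dim);
  Vec sdotr(recurrent_dim);
  for (int j = 0; j < recurrent_dim; j++)
    sdotr[j] = s_prev[j] * r[j];                  // s_{t-1} \dot r_t
  h->assign(cell_dim, 0.0f);
  c->assign(cell_dim, 0.0f);
  for (int i = 0; i < cell_dim; i++) {
    float sum = hpart[i];                         // hpart_t = U^h x_t, from a preceding affine layer
    for (int j = 0; j < recurrent_dim; j++)
      sum += w_h[i][j] * sdotr[j];                // + W^h (s_{t-1} \dot r_t)
    (*h)[i] = std::tanh(sum);
    (*c)[i] = (1.0f - z[i]) * (*h)[i] + z[i] * c_prev[i];
  }
}

For the non-projected case the same sketch applies with recurrent_dim == cell_dim and c_prev == s_prev (both equal to y_{t-1}).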
+ + This component stores stats of the same form as are normally stored by the + StoreStats() functions for the sigmoid and tanh units, i.e. averages of the + activations and derivatives, but this is done inside the Backprop() functions. + + + The main configuration values that are accepted: + cell-dim e.g. cell-dim=1024 Cell dimension. + recurrent-dim e.g. recurrent-dim=256. If not specified, we assume + this is a non-projected GRU. + param-stddev Standard deviation for random initialization of + the matrix W^h. Defaults to 1.0 / sqrt(d) where + d is recurrent-dim if specified, else cell-dim. + self-repair-threshold Equivalent to the self-repair-lower-threshold + in a TanhComponent; applies to the tanh nonlinearity. + default=0.2, you probably won't want to change this. + self-repair-scale Equivalent to the self-repair-scale in a + TanhComponent; applies to the tanh nonlinearity. + default=1.0e-05, which you probably won't want to + change unless dealing with an objective function that + has smaller or larger dynamic range than normal, in + which case you might want to make it smaller or + larger. + + Values inherited from UpdatableComponent (see its declaration in + nnet-component-itf.h for details): + learning-rate + learning-rate-factor + max-change + + Natural-gradient related options are below; you won't normally have to + set these. + alpha Constant that determines how much we smooth the + Fisher-matrix estimates with the unit matrix. + Larger means more smoothing. default=4.0 + rank-in Rank used in low-rank-plus-unit estimate of Fisher + matrix in the input space. default=20. + rank-out Rank used in low-rank-plus-unit estimate of Fisher + matrix in the output-derivative space. default=80. + update-period Determines the period (in minibatches) with which + we update the Fisher-matrix estimates; + making this > 1 saves a little time in training. + default=4. + + + Recap of what this computes: + If recurrent-dim is specified, this component implements + the function + (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t) + of dims: + (cell_dim, recurrent_dim, cell_dim, cell_dim, recurrent_dim) -> (cell_dim, cell_dim). + where: + h_t = \tanh( hpart_t + W^h (s_{t-1} \dot r_t)) + c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. + If recurrent-dim is not specified, this component implements + the function + (z_t, r_t, hpart_t, y_{t-1}) -> (h_t, y_t) + of dimensions + (cell_dim, cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim), + where: + h_t = \tanh( hpart_t + W^h (y_{t-1} \dot r_t)) + y_t = (1 - z_t) \dot h_t + z_t \dot y_{t-1}. 
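For concreteness, a configuration line exercising the main options above might look like the following; the component name and the particular values are invented for illustration and are not taken from any recipe:

component name=gru1_nonlin type=GruNonlinearityComponent cell-dim=1024 recurrent-dim=256 self-repair-scale=1.0e-05 max-change=0.75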
+*/ +class GruNonlinearityComponent: public UpdatableComponent { + public: + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + GruNonlinearityComponent() { } + virtual std::string Type() const { return "GruNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|\ + kBackpropNeedsOutput|kBackpropAdds; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const { return new GruNonlinearityComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + + // Some functions from base-class UpdatableComponent. + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void ZeroStats(); + virtual void FreezeNaturalGradient(bool freeze); + + // Some functions that are specific to this class: + explicit GruNonlinearityComponent( + const GruNonlinearityComponent &other); + + private: + + void Check() const; // checks dimensions, etc. + + /** + This function stores value and derivative stats for the tanh + nonlinearity that is a part of this component, and if needed + adds the small 'self-repair' term to 'h_t_deriv'. + @param [in] h_t The output of the tanh expression from the + forward pass. + @param [in,out] h_t_deriv To here will be added the small + self-repair term (this is a small value + that we use to push oversaturated neurons + back to the center). + This function has side effects on the class instance, specifically the + members value_sum_, deriv_sum, self_repair_total_, and count_. + */ + void TanhStatsAndSelfRepair(const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv); + + /* This function is responsible for updating the w_h_ matrix + (taking into account the learning rate). + @param [in] sdotr The value of the expression (s_{t-1} \dot r_t). + @param [in] h_t_deriv The derivative of the objective + function w.r.t. the argument of the tanh + function, i.e. w.r.t. the expression + "hpart_t + W^h (s_{t-1} \dot r_t)". + This function is concerned with the second + term as it affects the derivative w.r.t. W^h. + */ + void UpdateParameters(const CuMatrixBase &sdotr, + const CuMatrixBase &h_t_deriv); + + + int32 cell_dim_; // cell dimension, e.g. 1024. + int32 recurrent_dim_; // recurrent dimension, e.g. 256 for projected GRU; + // if it's the same as cell_dim it means we are + // implementing regular (non-projected) GRU + + + // The matrix W^h, of dimension cell_dim_ by recurrent_dim_. + // There is no bias term needed here because hpart_t comes from + // an affine component that has a bias. + CuMatrix w_h_; + + // Of dimension cell_dim_, this is comparable to the value_sum_ vector in + // class NonlinearComponent. It stores the sum of the tanh nonlinearity. 
+  // Normalize by dividing by count_.
+  CuVector value_sum_;
+
+  // Of dimension cell_dim_, this is comparable to the deriv_sum_ vector in
+  // class NonlinearComponent.  It stores the sum of the function-derivative of
+  // the tanh nonlinearity.  Normalize by dividing by count_.
+  CuVector deriv_sum_;
+
+  // This is part of the stats (along with value_sum_, deriv_sum_, and count_);
+  // if you divide it by count_ it gives you the proportion of the time that an
+  // average dimension was subject to self-repair.
+  double self_repair_total_;
+
+  // The total count (number of frames) corresponding to the stats in value_sum_,
+  // deriv_sum_, and self_repair_total_.
+  double count_;
+
+  // A configuration parameter, this determines how saturated the derivative
+  // has to be for a particular dimension before we activate self-repair.
+  // Default value is 0.2, the same as for TanhComponent.
+  BaseFloat self_repair_threshold_;
+
+  // A configuration parameter, this determines the maximum absolute value of
+  // the extra term that we add to the input derivative of the tanh when doing
+  // self repair.  The default value is 1.0e-05.
+  BaseFloat self_repair_scale_;
+
+  // Preconditioner for the input space when updating w_h_ (has dimension
+  // recurrent_dim_ if use-natural-gradient was true, else not set up).
+  // The preconditioner stores its own configuration values; we write and read
+  // these, but not the preconditioner object itself.
+  OnlineNaturalGradient preconditioner_in_;
+
+  // Preconditioner for the output space when updating w_h_ (has dimension
+  // recurrent_dim_ if use-natural-gradient was true, else not set up).
+  OnlineNaturalGradient preconditioner_out_;
+
+  const GruNonlinearityComponent &operator
+      = (const GruNonlinearityComponent &other); // Disallow.
+};
+
+
+/**
+   OutputGruNonlinearityComponent is a component that implements part of an
+   Output-Gated Recurrent Unit (OGRU).  Compared with the traditional GRU, it
+   uses an output gate instead of a reset gate, and the formula for h_t is
+   different.  You can regard it as a variant of the GRU.
+   This code is more efficient in time and memory than stitching the unit
+   together from more basic components.
+   For a brief summary of what this actually computes, search for 'recap' below;
+   the first part of this comment establishes the context.  For more information
+   about GRUs, please see the summary in GruNonlinearityComponent.
+
+   Before describing what this component does, we'll establish
+   some notation for the OGRU.
+
+   We use the same notation as for the GRU above.  We'll also
+   ignore the bias terms for purposes of this exposition (let them be implicit).
+
+
+   Regular OGRU:
+
+   z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate, dim == cell_dim
+   o_t = \sigmoid ( U^o x_t + W^o y_{t-1} ) # output gate, dim == cell_dim
+   h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) # dim == cell_dim
+   c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # dim == cell_dim
+   y_t = ( c_t \dot o_t )
+
+   For the "projected OGRU", the 'cell_dim x cell_dim' full-matrix expressions
+   W^z and W^o that participate in the expressions for z_t and o_t are
+   replaced with skinny matrices of dimension 'cell_dim x recurrent_dim'
+   (where recurrent_dim < cell_dim), and the output is replaced by
+   a lower-dimension projection of the hidden state, of dimension
+   'recurrent_dim + non_recurrent_dim < cell_dim', instead of the
+   full 'cell_dim'.
+   s_t is a dimension range of the output y_t.
Parameters of the + projected OGRU: + cell_dim > 0 + recurrent_dim > 0 + non_recurrent_dim > 0 (where non_recurrent_dim + recurrent_dim <= cell_dim). + + + Equations: + + z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate, dim(z_t) == cell_dim + o_t = \sigmoid ( U^o x_t + W^o s_{t-1} ) # output gate, dim(o_t) == cell_dim + h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) # dim(h_t) == cell_dim + c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # dim(c_t) == cell_dim + y_t = ( c_t \dot o_t) W^y # dim(y_t) = recurrent_dim + non_recurrent_dim. + # This is the output of the OGRU. + s_t = y_t[0:recurrent_dim-1] # dimension range of y_t, dim(s_t) = recurrent_dim. + + + Because we'll need it below, we define + hpart_t = U^h x_t + which is a subexpression of h_t. + + Our choice to make a "special" component for the projected OGRU is to have + it be a function from + (z_t, hpart_t, c_{t-1}) -> (h_t, c_t) + That is, the input to the component is all those things on the LHS + appended together, and the output is the two things on the + RHS appended together. The dimensions are: + (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim). + The component computes the functions: + h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) + c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} + + Notice that 'W^h' is the only parameter that lives inside the component. + + You might also notice that the output 'h_t' is never actually used + in any other part of the GRU, so the question arises: why is it + necessary to have it be an output of the component? This has to do with + saving computation: because h_t is an output, and we'll be defining + the kBackpropNeedsOutput flag, it is available in the backprop phase + and this helps us avoid some computation (otherwise we'd have to do + a redundant multiplication by W^h in the backprop phase that we already + did in the forward phase). We could have used the 'memo' mechanism to + do this, but this is undesirable because the use of a memo disables + 'update consolidation' in the backprop so we'd lose a little + speed there. + + This component stores stats of the same form as are normally stored by the + StoreStats() functions for the sigmoid and tanh units, i.e. averages of the + activations and derivatives, but this is done inside the Backprop() functions. + + + The main configuration values that are accepted: + cell-dim e.g. cell-dim=1024 Cell dimension. + recurrent-dim e.g. recurrent-dim=256. If not specified, we assume + this is a non-projected GRU. + param-stddev Standard deviation for random initialization of + the matrix W^h. Defaults to 1.0 / sqrt(d) where + d is recurrent-dim if specified, else cell-dim. + self-repair-threshold Equivalent to the self-repair-lower-threshold + in a TanhComponent; applies to the tanh nonlinearity. + default=0.2, you probably won't want to change this. + self-repair-scale Equivalent to the self-repair-scale in a + TanhComponent; applies to the tanh nonlinearity. + default=1.0e-05, which you probably won't want to + change unless dealing with an objective function that + has smaller or larger dynamic range than normal, in + which case you might want to make it smaller or + larger. + + Values inherited from UpdatableComponent (see its declaration in + nnet-component-itf.h for details): + learning-rate + learning-rate-factor + max-change + + Natural-gradient related options are below; you won't normally have to + set these. + alpha Constant that determines how much we smooth the + Fisher-matrix estimates with the unit matrix. 
+ Larger means more smoothing. default=4.0 + rank The rank of the correction to the unit matrix. + default=8. + update-period Determines the period (in minibatches) with which + we update the Fisher-matrix estimates; + making this > 1 saves a little time in training. + default=10. + + + Recap of what this computes: + This component implements the function + (z_t, hpart_t, c_{t-1}) -> (h_t, c_t) + of dimensions + (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim), + where: + h_t = \tanh( hpart_t + W^h \dot c_{t-1} ) + c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}. +*/ +class OutputGruNonlinearityComponent: public UpdatableComponent { + public: + + virtual int32 InputDim() const; + virtual int32 OutputDim() const; + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + OutputGruNonlinearityComponent() { } + virtual std::string Type() const { return "OutputGruNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|\ + kBackpropNeedsOutput|kBackpropAdds; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const { return new OutputGruNonlinearityComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + + // Some functions from base-class UpdatableComponent. + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void ZeroStats(); + virtual void FreezeNaturalGradient(bool freeze); + + // Some functions that are specific to this class: + explicit OutputGruNonlinearityComponent( + const OutputGruNonlinearityComponent &other); + + private: + + void Check() const; // checks dimensions, etc. + + /** + This function stores value and derivative stats for the tanh + nonlinearity that is a part of this component, and if needed + adds the small 'self-repair' term to 'h_t_deriv'. + @param [in] h_t The output of the tanh expression from the + forward pass. + @param [in,out] h_t_deriv To here will be added the small + self-repair term (this is a small value + that we use to push oversaturated neurons + back to the center). + This function has side effects on the class instance, specifically the + members value_sum_, deriv_sum, self_repair_total_, and count_. + */ + void TanhStatsAndSelfRepair(const CuMatrixBase &h_t, + CuMatrixBase *h_t_deriv); + + /* This function is responsible for updating the w_h_ matrix + (taking into account the learning rate). + @param [in] c_t1_value The value of c_{t-1}. + @param [in] h_t_deriv The derivative of the objective + function w.r.t. the argument of the tanh + function, i.e. w.r.t. the expression + "hpart_t + W^h \dot c_t1". + This function is concerned with the second + term as it affects the derivative w.r.t. W^h. 
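As a sanity check on the description above, the following sketch shows what the plain-SGD form of this update amounts to when natural gradient and max-change are ignored; it is illustrative only and the types are simplified (the real code works on CuMatrix/CuVector).

#include <cstddef>
#include <vector>

// Since W^h acts per-dimension here, the tanh argument depends on w_h[i]
// only through w_h[i] * c_{t-1}(t, i), so
//   d(objf)/d(w_h[i]) = sum_t h_t_deriv(t, i) * c_t1_value(t, i).
void UpdateWhSketch(const std::vector<std::vector<float> > &c_t1_value,  // frames x cell_dim
                    const std::vector<std::vector<float> > &h_t_deriv,   // frames x cell_dim
                    float learning_rate,
                    std::vector<float> *w_h) {                           // dim cell_dim
  for (size_t t = 0; t < c_t1_value.size(); t++)
    for (size_t i = 0; i < w_h->size(); i++)
      (*w_h)[i] += learning_rate * h_t_deriv[t][i] * c_t1_value[t][i];
}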
+  */
+  void UpdateParameters(const CuMatrixBase &c_t1_value,
+                        const CuMatrixBase &h_t_deriv);
+
+
+  int32 cell_dim_; // cell dimension, e.g. 1024.
+
+  // The parameters W^h.  In this component W^h acts per-dimension (elementwise
+  // on c_{t-1}), so it is stored as a vector of dimension cell_dim_.
+  // There is no bias term needed here because hpart_t comes from
+  // an affine component that has a bias.
+  CuVector w_h_;
+
+  // Of dimension cell_dim_, this is comparable to the value_sum_ vector in
+  // class NonlinearComponent.  It stores the sum of the tanh nonlinearity.
+  // Normalize by dividing by count_.
+  CuVector value_sum_;
+
+  // Of dimension cell_dim_, this is comparable to the deriv_sum_ vector in
+  // class NonlinearComponent.  It stores the sum of the function-derivative of
+  // the tanh nonlinearity.  Normalize by dividing by count_.
+  CuVector deriv_sum_;
+
+  // This is part of the stats (along with value_sum_, deriv_sum_, and count_);
+  // if you divide it by count_ it gives you the proportion of the time that an
+  // average dimension was subject to self-repair.
+  double self_repair_total_;
+
+  // The total count (number of frames) corresponding to the stats in value_sum_,
+  // deriv_sum_, and self_repair_total_.
+  double count_;
+
+  // A configuration parameter, this determines how saturated the derivative
+  // has to be for a particular dimension before we activate self-repair.
+  // Default value is 0.2, the same as for TanhComponent.
+  BaseFloat self_repair_threshold_;
+
+  // A configuration parameter, this determines the maximum absolute value of
+  // the extra term that we add to the input derivative of the tanh when doing
+  // self repair.  The default value is 1.0e-05.
+  BaseFloat self_repair_scale_;
+
+  // Unlike GruNonlinearityComponent, the parameters here are a vector rather
+  // than a matrix, so there is only one dimension to consider and we only
+  // need one preconditioner.
+  OnlineNaturalGradient preconditioner_;
+
+  const OutputGruNonlinearityComponent &operator
+      = (const OutputGruNonlinearityComponent &other); // Disallow.
+}; + + +} // namespace nnet3 +} // namespace kaldi + + +#endif diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index d2d325d22f1..1ff7daa01d1 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -23,6 +23,7 @@ #include #include "nnet3/nnet-component-itf.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-combined-component.h" #include "nnet3/nnet-normalize-component.h" #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-convolutional-component.h" @@ -178,6 +179,10 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new RestrictedAttentionComponent(); } else if (component_type == "SumBlockComponent") { ans = new SumBlockComponent(); + } else if (component_type == "GruNonlinearityComponent") { + ans = new GruNonlinearityComponent(); + } else if (component_type == "OutputGruNonlinearityComponent") { + ans = new OutputGruNonlinearityComponent(); } else if (component_type == "ScaleAndOffsetComponent") { ans = new ScaleAndOffsetComponent(); } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 7a5eb7017a3..e8c99494b06 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1,6 +1,6 @@ // nnet3/nnet-simple-component.cc -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) // 2015 Xiaohui Zhang // 2015 Guoguo Chen // 2015 Daniel Galvez @@ -3942,939 +3942,6 @@ void NaturalGradientPerElementScaleComponent::ConsolidateMemory() { preconditioner_.Swap(&temp); } -// Constructors for the convolution component -ConvolutionComponent::ConvolutionComponent(): - UpdatableComponent(), - input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), - filt_x_dim_(0), filt_y_dim_(0), - filt_x_step_(0), filt_y_step_(0), - input_vectorization_(kZyx) { } - -ConvolutionComponent::ConvolutionComponent( - const ConvolutionComponent &component): - UpdatableComponent(component), - input_x_dim_(component.input_x_dim_), - input_y_dim_(component.input_y_dim_), - input_z_dim_(component.input_z_dim_), - filt_x_dim_(component.filt_x_dim_), - filt_y_dim_(component.filt_y_dim_), - filt_x_step_(component.filt_x_step_), - filt_y_step_(component.filt_y_step_), - input_vectorization_(component.input_vectorization_), - filter_params_(component.filter_params_), - bias_params_(component.bias_params_) { } - -ConvolutionComponent::ConvolutionComponent( - const CuMatrixBase &filter_params, - const CuVectorBase &bias_params, - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - BaseFloat learning_rate): - input_x_dim_(input_x_dim), - input_y_dim_(input_y_dim), - input_z_dim_(input_z_dim), - filt_x_dim_(filt_x_dim), - filt_y_dim_(filt_y_dim), - filt_x_step_(filt_x_step), - filt_y_step_(filt_y_step), - input_vectorization_(input_vectorization), - filter_params_(filter_params), - bias_params_(bias_params){ - KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() && - bias_params.Dim() != 0); - KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim); - SetUnderlyingLearningRate(learning_rate); - is_gradient_ = false; -} - -// aquire input dim -int32 ConvolutionComponent::InputDim() const { - return input_x_dim_ * input_y_dim_ * input_z_dim_; -} - -// aquire output dim -int32 ConvolutionComponent::OutputDim() const { - 
int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); - int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); - int32 num_filters = filter_params_.NumRows(); - return num_x_steps * num_y_steps * num_filters; -} - -// initialize the component using hyperparameters -void ConvolutionComponent::Init( - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, int32 num_filters, - TensorVectorizationType input_vectorization, - BaseFloat param_stddev, BaseFloat bias_stddev) { - input_x_dim_ = input_x_dim; - input_y_dim_ = input_y_dim; - input_z_dim_ = input_z_dim; - filt_x_dim_ = filt_x_dim; - filt_y_dim_ = filt_y_dim; - filt_x_step_ = filt_x_step; - filt_y_step_ = filt_y_step; - input_vectorization_ = input_vectorization; - KALDI_ASSERT((input_x_dim_ - filt_x_dim_) % filt_x_step_ == 0); - KALDI_ASSERT((input_y_dim_ - filt_y_dim_) % filt_y_step_ == 0); - int32 filter_dim = filt_x_dim_ * filt_y_dim_ * input_z_dim_; - filter_params_.Resize(num_filters, filter_dim); - bias_params_.Resize(num_filters); - KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0); - filter_params_.SetRandn(); - filter_params_.Scale(param_stddev); - bias_params_.SetRandn(); - bias_params_.Scale(bias_stddev); -} - -// initialize the component using predefined matrix file -void ConvolutionComponent::Init( - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - std::string matrix_filename) { - input_x_dim_ = input_x_dim; - input_y_dim_ = input_y_dim; - input_z_dim_ = input_z_dim; - filt_x_dim_ = filt_x_dim; - filt_y_dim_ = filt_y_dim; - filt_x_step_ = filt_x_step; - filt_y_step_ = filt_y_step; - input_vectorization_ = input_vectorization; - CuMatrix mat; - ReadKaldiObject(matrix_filename, &mat); - int32 filter_dim = (filt_x_dim_ * filt_y_dim_ * input_z_dim_); - int32 num_filters = mat.NumRows(); - KALDI_ASSERT(mat.NumCols() == (filter_dim + 1)); - filter_params_.Resize(num_filters, filter_dim); - bias_params_.Resize(num_filters); - filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim)); - bias_params_.CopyColFromMat(mat, filter_dim); -} - -// display information about component -std::string ConvolutionComponent::Info() const { - std::ostringstream stream; - stream << UpdatableComponent::Info() - << ", input-x-dim=" << input_x_dim_ - << ", input-y-dim=" << input_y_dim_ - << ", input-z-dim=" << input_z_dim_ - << ", filt-x-dim=" << filt_x_dim_ - << ", filt-y-dim=" << filt_y_dim_ - << ", filt-x-step=" << filt_x_step_ - << ", filt-y-step=" << filt_y_step_ - << ", input-vectorization=" << input_vectorization_ - << ", num-filters=" << filter_params_.NumRows(); - PrintParameterStats(stream, "filter-params", filter_params_); - PrintParameterStats(stream, "bias-params", bias_params_, true); - return stream.str(); -} - -// initialize the component using configuration file -void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) { - bool ok = true; - std::string matrix_filename; - int32 input_x_dim = -1, input_y_dim = -1, input_z_dim = -1, - filt_x_dim = -1, filt_y_dim = -1, - filt_x_step = -1, filt_y_step = -1, - num_filters = -1; - std::string input_vectorization_order = "zyx"; - InitLearningRatesFromConfig(cfl); - ok = ok && cfl->GetValue("input-x-dim", &input_x_dim); - ok = ok && cfl->GetValue("input-y-dim", &input_y_dim); - ok = ok && cfl->GetValue("input-z-dim", &input_z_dim); 
- ok = ok && cfl->GetValue("filt-x-dim", &filt_x_dim); - ok = ok && cfl->GetValue("filt-y-dim", &filt_y_dim); - ok = ok && cfl->GetValue("filt-x-step", &filt_x_step); - ok = ok && cfl->GetValue("filt-y-step", &filt_y_step); - - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - // optional argument - TensorVectorizationType input_vectorization; - cfl->GetValue("input-vectorization-order", &input_vectorization_order); - if (input_vectorization_order.compare("zyx") == 0) { - input_vectorization = kZyx; - } else if (input_vectorization_order.compare("yzx") == 0) { - input_vectorization = kYzx; - } else { - KALDI_ERR << "Unknown or unsupported input vectorization order " - << input_vectorization_order - << " accepted candidates are 'yzx' and 'zyx'"; - } - - if (cfl->GetValue("matrix", &matrix_filename)) { - // initialize from prefined parameter matrix - Init(input_x_dim, input_y_dim, input_z_dim, - filt_x_dim, filt_y_dim, - filt_x_step, filt_y_step, - input_vectorization, - matrix_filename); - } else { - ok = ok && cfl->GetValue("num-filters", &num_filters); - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - // initialize from configuration - int32 filter_input_dim = filt_x_dim * filt_y_dim * input_z_dim; - BaseFloat param_stddev = 1.0 / std::sqrt(filter_input_dim), bias_stddev = 1.0; - cfl->GetValue("param-stddev", ¶m_stddev); - cfl->GetValue("bias-stddev", &bias_stddev); - Init(input_x_dim, input_y_dim, input_z_dim, - filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, num_filters, - input_vectorization, param_stddev, bias_stddev); - } - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); -} - -// Inline methods to convert from tensor index i.e., (x,y,z) index -// to index in yzx or zyx vectorized tensors -inline int32 YzxVectorIndex(int32 x, int32 y, int32 z, - int32 input_x_dim, - int32 input_y_dim, - int32 input_z_dim) { - KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); - return (input_y_dim * input_z_dim) * x + (input_y_dim) * z + y; -} - -inline int32 ZyxVectorIndex(int32 x, int32 y, int32 z, - int32 input_x_dim, - int32 input_y_dim, - int32 input_z_dim) { - KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim); - return (input_y_dim * input_z_dim) * x + (input_z_dim) * y + z; -} - -// Method to convert from a matrix representing a minibatch of vectorized -// 3D tensors to patches for convolution, each patch corresponds to -// one dot product in the convolution -void ConvolutionComponent::InputToInputPatches( - const CuMatrixBase& in, - CuMatrix *patches) const{ - int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); - int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); - const int32 filt_x_step = filt_x_step_, - filt_y_step = filt_y_step_, - filt_x_dim = filt_x_dim_, - filt_y_dim = filt_y_dim_, - input_x_dim = input_x_dim_, - input_y_dim = input_y_dim_, - input_z_dim = input_z_dim_, - filter_dim = filter_params_.NumCols(); - - std::vector column_map(patches->NumCols()); - int32 column_map_size = column_map.size(); - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - int32 patch_start_index = patch_number * filter_dim; - for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { - for (int32 y = 0; y < 
filt_y_dim; y++) { - for (int32 z = 0; z < input_z_dim; z++, index++) { - KALDI_ASSERT(index < column_map_size); - if (input_vectorization_ == kZyx) { - column_map[index] = ZyxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } else if (input_vectorization_ == kYzx) { - column_map[index] = YzxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } - } - } - } - } - } - CuArray cu_cols(column_map); - patches->CopyCols(in, cu_cols); -} - - -// propagation function -// see function declaration in nnet-simple-component.h for details -void* ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - num_filters = filter_params_.NumRows(), - num_frames = in.NumRows(), - filter_dim = filter_params_.NumCols(); - KALDI_ASSERT((*out).NumRows() == num_frames && - (*out).NumCols() == (num_filters * num_x_steps * num_y_steps)); - - CuMatrix patches(num_frames, - num_x_steps * num_y_steps * filter_dim, - kUndefined); - InputToInputPatches(in, &patches); - CuSubMatrix* filter_params_elem = new CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); - std::vector* > tgt_batch, patch_batch, - filter_params_batch; - - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - tgt_batch.push_back(new CuSubMatrix( - out->ColRange(patch_number * num_filters, num_filters))); - patch_batch.push_back(new CuSubMatrix( - patches.ColRange(patch_number * filter_dim, filter_dim))); - filter_params_batch.push_back(filter_params_elem); - tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias - } - } - // apply all filters - AddMatMatBatched(1.0, tgt_batch, patch_batch, - kNoTrans, filter_params_batch, - kTrans, 1.0); - // release memory - delete filter_params_elem; - for (int32 p = 0; p < tgt_batch.size(); p++) { - delete tgt_batch[p]; - delete patch_batch[p]; - } - return NULL; -} - -// scale the parameters -void ConvolutionComponent::Scale(BaseFloat scale) { - if (scale == 0.0) { - filter_params_.SetZero(); - bias_params_.SetZero(); - } else { - filter_params_.Scale(scale); - bias_params_.Scale(scale); - } -} - -// add another convolution component -void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) { - const ConvolutionComponent *other = - dynamic_cast(&other_in); - KALDI_ASSERT(other != NULL); - filter_params_.AddMat(alpha, other->filter_params_); - bias_params_.AddVec(alpha, other->bias_params_); -} - -/* - This function transforms a vector of lists into a list of vectors, - padded with -1. - @param[in] The input vector of lists. Let in.size() be D, and let - the longest list length (i.e. the max of in[i].size()) be L. - @param[out] The output list of vectors. The length of the list will - be L, each vector-dimension will be D (i.e. out[i].size() == D), - and if in[i] == j, then for some k we will have that - out[k][j] = i. The output vectors are padded with -1 - where necessary if not all the input lists have the same side. 
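As a concrete illustration of the transform described above (example values invented): with D = 3 input lists,

    in  = { {5}, {7, 8}, {} }           // longest list has length L = 2
    out = { {5, 7, -1}, {-1, 8, -1} }   // L vectors, each of dimension D

i.e. out[k][i] == in[i][k] wherever in[i] has a k'th element, and -1 elsewhere.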
-*/ -void RearrangeIndexes(const std::vector > &in, - std::vector > *out) { - int32 D = in.size(); - int32 L = 0; - for (int32 i = 0; i < D; i++) - if (in[i].size() > L) - L = in[i].size(); - out->resize(L); - for (int32 i = 0; i < L; i++) - (*out)[i].resize(D, -1); - for (int32 i = 0; i < D; i++) { - for (int32 j = 0; j < in[i].size(); j++) { - (*out)[j][i] = in[i][j]; - } - } -} - -// Method to compute the input derivative matrix from the input derivatives -// for patches, where each patch corresponds to one dot product -// in the convolution -void ConvolutionComponent::InderivPatchesToInderiv( - const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const { - - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - filt_x_step = filt_x_step_, - filt_y_step = filt_y_step_, - filt_x_dim = filt_x_dim_, - filt_y_dim = filt_y_dim_, - input_x_dim = input_x_dim_, - input_y_dim = input_y_dim_, - input_z_dim = input_z_dim_, - filter_dim = filter_params_.NumCols(); - - // Compute the reverse column_map from the matrix with input - // derivative patches to input derivative matrix - std::vector > reverse_column_map(in_deriv->NumCols()); - int32 rev_col_map_size = reverse_column_map.size(); - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - int32 patch_start_index = patch_number * filter_dim; - for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) { - for (int32 y = 0; y < filt_y_dim; y++) { - for (int32 z = 0; z < input_z_dim; z++, index++) { - int32 vector_index; - if (input_vectorization_ == kZyx) { - vector_index = ZyxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } else { - KALDI_ASSERT(input_vectorization_ == kYzx); - vector_index = YzxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); - } - KALDI_ASSERT(vector_index < rev_col_map_size); - reverse_column_map[vector_index].push_back(index); - } - } - } - } - } - std::vector > rearranged_column_map; - RearrangeIndexes(reverse_column_map, &rearranged_column_map); - for (int32 p = 0; p < rearranged_column_map.size(); p++) { - CuArray cu_cols(rearranged_column_map[p]); - in_deriv->AddCols(in_deriv_patches, cu_cols); - } -} - -// back propagation function -// see function declaration in nnet-simple-component.h for details -void ConvolutionComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const { - ConvolutionComponent *to_update = - dynamic_cast(to_update_in); - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - num_filters = filter_params_.NumRows(), - num_frames = out_deriv.NumRows(), - filter_dim = filter_params_.NumCols(); - - KALDI_ASSERT(out_deriv.NumRows() == num_frames && - out_deriv.NumCols() == - (num_filters * num_x_steps * num_y_steps)); - - // Compute inderiv patches - CuMatrix in_deriv_patches(num_frames, - num_x_steps * num_y_steps * filter_dim, - kSetZero); - - std::vector* > patch_deriv_batch, out_deriv_batch, - filter_params_batch; - CuSubMatrix* filter_params_elem = new 
CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); - - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - - patch_deriv_batch.push_back(new CuSubMatrix( - in_deriv_patches.ColRange( - patch_number * filter_dim, filter_dim))); - out_deriv_batch.push_back(new CuSubMatrix(out_deriv.ColRange( - patch_number * num_filters, num_filters))); - filter_params_batch.push_back(filter_params_elem); - } - } - AddMatMatBatched(1.0, patch_deriv_batch, - out_deriv_batch, kNoTrans, - filter_params_batch, kNoTrans, 0.0); - - if (in_deriv) { - // combine the derivatives from the individual input deriv patches - // to compute input deriv matrix - InderivPatchesToInderiv(in_deriv_patches, in_deriv); - } - - if (to_update != NULL) { - to_update->Update(debug_info, in_value, out_deriv, out_deriv_batch); - } - - // release memory - delete filter_params_elem; - for (int32 p = 0; p < patch_deriv_batch.size(); p++) { - delete patch_deriv_batch[p]; - delete out_deriv_batch[p]; - } -} - - -// update parameters -// see function declaration in nnet-simple-component.h for details -void ConvolutionComponent::Update(const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv, - const std::vector *>& out_deriv_batch) { - // useful dims - const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), - num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), - num_filters = filter_params_.NumRows(), - num_frames = out_deriv.NumRows(), - filter_dim = filter_params_.NumCols(); - KALDI_ASSERT(out_deriv.NumRows() == num_frames && - out_deriv.NumCols() == - (num_filters * num_x_steps * num_y_steps)); - - - CuMatrix filters_grad; - CuVector bias_grad; - - CuMatrix input_patches(num_frames, - filter_dim * num_x_steps * num_y_steps, - kUndefined); - InputToInputPatches(in_value, &input_patches); - - filters_grad.Resize(num_filters, filter_dim, kSetZero); // reset - bias_grad.Resize(num_filters, kSetZero); // reset - - // create a single large matrix holding the smaller matrices - // from the vector container filters_grad_batch along the rows - CuMatrix filters_grad_blocks_batch( - num_x_steps * num_y_steps * filters_grad.NumRows(), - filters_grad.NumCols()); - - std::vector* > filters_grad_batch, input_patch_batch; - - for (int32 x_step = 0; x_step < num_x_steps; x_step++) { - for (int32 y_step = 0; y_step < num_y_steps; y_step++) { - int32 patch_number = x_step * num_y_steps + y_step; - filters_grad_batch.push_back(new CuSubMatrix( - filters_grad_blocks_batch.RowRange( - patch_number * filters_grad.NumRows(), filters_grad.NumRows()))); - - input_patch_batch.push_back(new CuSubMatrix( - input_patches.ColRange(patch_number * filter_dim, filter_dim))); - } - } - - AddMatMatBatched(1.0, filters_grad_batch, out_deriv_batch, kTrans, - input_patch_batch, kNoTrans, 1.0); - - // add the row blocks together to filters_grad - filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch); - - // create a matrix holding the col blocks sum of out_deriv - CuMatrix out_deriv_col_blocks_sum(out_deriv.NumRows(), - num_filters); - - // add the col blocks together to out_deriv_col_blocks_sum - out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv); - - bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0); - - // release memory - for (int32 p = 0; p < input_patch_batch.size(); p++) { - delete filters_grad_batch[p]; - delete 
input_patch_batch[p]; - } - - // - // update - // - filter_params_.AddMat(learning_rate_, filters_grad); - bias_params_.AddVec(learning_rate_, bias_grad); -} - -void ConvolutionComponent::Read(std::istream &is, bool binary) { - ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_x_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_y_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_z_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_x_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_y_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_y_step_); - ExpectToken(is, binary, ""); - int32 input_vectorization; - ReadBasicType(is, binary, &input_vectorization); - input_vectorization_ = static_cast(input_vectorization); - ExpectToken(is, binary, ""); - filter_params_.Read(is, binary); - ExpectToken(is, binary, ""); - bias_params_.Read(is, binary); - std::string tok; - ReadToken(is, binary, &tok); - if (tok == "") { - ReadBasicType(is, binary, &is_gradient_); - ExpectToken(is, binary, ""); - } else { - is_gradient_ = false; - KALDI_ASSERT(tok == ""); - } -} - -void ConvolutionComponent::Write(std::ostream &os, bool binary) const { - WriteUpdatableCommon(os, binary); // write opening tag and learning rate. - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_x_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_y_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_z_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_x_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_y_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_y_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(input_vectorization_)); - WriteToken(os, binary, ""); - filter_params_.Write(os, binary); - WriteToken(os, binary, ""); - bias_params_.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, is_gradient_); - WriteToken(os, binary, ""); -} - -BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const { - const ConvolutionComponent *other = - dynamic_cast(&other_in); - return TraceMatMat(filter_params_, other->filter_params_, kTrans) - + VecVec(bias_params_, other->bias_params_); -} - -Component* ConvolutionComponent::Copy() const { - ConvolutionComponent *ans = new ConvolutionComponent(*this); - return ans; -} - -void ConvolutionComponent::PerturbParams(BaseFloat stddev) { - CuMatrix temp_filter_params(filter_params_); - temp_filter_params.SetRandn(); - filter_params_.AddMat(stddev, temp_filter_params); - - CuVector temp_bias_params(bias_params_); - temp_bias_params.SetRandn(); - bias_params_.AddVec(stddev, temp_bias_params); -} - -void ConvolutionComponent::SetParams(const VectorBase &bias, - const MatrixBase &filter) { - bias_params_ = bias; - filter_params_ = filter; - KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows()); -} - -int32 ConvolutionComponent::NumParameters() const { - return (filter_params_.NumCols() + 1) * filter_params_.NumRows(); -} - -void ConvolutionComponent::Vectorize(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == this->NumParameters()); - int32 num_filter_params = 
filter_params_.NumCols() * filter_params_.NumRows(); - params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_); - params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_); -} -void ConvolutionComponent::UnVectorize(const VectorBase ¶ms) { - KALDI_ASSERT(params.Dim() == this->NumParameters()); - int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows(); - filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params)); - bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim())); -} - -// aquire input dim -int32 MaxpoolingComponent::InputDim() const { - return input_x_dim_ * input_y_dim_ * input_z_dim_; -} - -MaxpoolingComponent::MaxpoolingComponent( - const MaxpoolingComponent &component): - input_x_dim_(component.input_x_dim_), - input_y_dim_(component.input_y_dim_), - input_z_dim_(component.input_z_dim_), - pool_x_size_(component.pool_x_size_), - pool_y_size_(component.pool_y_size_), - pool_z_size_(component.pool_z_size_), - pool_x_step_(component.pool_x_step_), - pool_y_step_(component.pool_y_step_), - pool_z_step_(component.pool_z_step_) { } - -// aquire output dim -int32 MaxpoolingComponent::OutputDim() const { - int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; - int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; - int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; - return num_pools_x * num_pools_y * num_pools_z; -} - -// check the component parameters -void MaxpoolingComponent::Check() const { - // sanity check of the max pooling parameters - KALDI_ASSERT(input_x_dim_ > 0); - KALDI_ASSERT(input_y_dim_ > 0); - KALDI_ASSERT(input_z_dim_ > 0); - KALDI_ASSERT(pool_x_size_ > 0); - KALDI_ASSERT(pool_y_size_ > 0); - KALDI_ASSERT(pool_z_size_ > 0); - KALDI_ASSERT(pool_x_step_ > 0); - KALDI_ASSERT(pool_y_step_ > 0); - KALDI_ASSERT(pool_z_step_ > 0); - KALDI_ASSERT(input_x_dim_ >= pool_x_size_); - KALDI_ASSERT(input_y_dim_ >= pool_y_size_); - KALDI_ASSERT(input_z_dim_ >= pool_z_size_); - KALDI_ASSERT(pool_x_size_ >= pool_x_step_); - KALDI_ASSERT(pool_y_size_ >= pool_y_step_); - KALDI_ASSERT(pool_z_size_ >= pool_z_step_); - KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0); - KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0); - KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0); -} - -// initialize the component using configuration file -void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { - bool ok = true; - - ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_); - ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_); - ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_); - ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_); - ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_); - ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_); - ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_); - ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_); - ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_); - - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (!ok) - KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - - Check(); -} - -// Method to convert from a matrix representing a minibatch of vectorized -// 3D tensors to patches for 3d max pooling, each patch corresponds to -// the nodes having the same local coordinatenodes from each pool -void MaxpoolingComponent::InputToInputPatches( - const CuMatrixBase& in, - 
CuMatrix *patches) const{ - int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; - int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; - int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; - - std::vector column_map(patches->NumCols()); - int32 column_map_size = column_map.size(); - for (int32 x = 0, index =0; x < pool_x_size_; x++) { - for (int32 y = 0; y < pool_y_size_; y++) { - for (int32 z = 0; z < pool_z_size_; z++) { - // given the local node coordinate, group them from each pool - // to form a patch - for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { - for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { - for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { - KALDI_ASSERT(index < column_map_size); - column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + - (y_pool * pool_y_step_ + y) * input_z_dim_ + - (z_pool * pool_z_step_ + z); - - } - } - } - } - } - } - CuArray cu_cols(column_map); - patches->CopyCols(in, cu_cols); -} - -/* - This is the 3d max pooling propagate function. - It is assumed that each row of the input matrix - is a vectorized 3D-tensor of type zxy. - Similar to the propagate function of ConvolutionComponent, - the input matrix is first arranged into patches so that - pools (with / without overlapping) could be - processed in a parallelizable manner. - The output matrix is also a vectorized 3D-tensor of type zxy. -*/ - -void* MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - int32 num_frames = in.NumRows(); - int32 num_pools = OutputDim(); - int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; - CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); - InputToInputPatches(in, &patches); - - out->Set(-1e20); // reset a large negative value - for (int32 q = 0; q < pool_size; q++) - out->Max(patches.ColRange(q * num_pools, num_pools)); - return NULL; -} - -// Method to compute the input derivative matrix from the input derivatives -// for patches, where each patch corresponds to -// the nodes having the same local coordinatenodes from each pool -void MaxpoolingComponent::InderivPatchesToInderiv( - const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const { - - int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; - int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; - int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; - - std::vector > reverse_column_map(in_deriv->NumCols()); - int32 rev_col_map_size = reverse_column_map.size(); - for (int32 x = 0, index = 0; x < pool_x_size_; x++) { - for (int32 y = 0; y < pool_y_size_; y++) { - for (int32 z = 0; z < pool_z_size_; z++) { - - for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) { - for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { - for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { - int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + - (y_pool * pool_y_step_ + y) * input_z_dim_ + - (z_pool * pool_z_step_ + z); - - KALDI_ASSERT(vector_index < rev_col_map_size); - reverse_column_map[vector_index].push_back(index); - } - } - } - } - } - } - std::vector > rearranged_column_map; - RearrangeIndexes(reverse_column_map, &rearranged_column_map); - for (int32 p = 0; p < rearranged_column_map.size(); p++) { - CuArray cu_cols(rearranged_column_map[p]); - in_deriv->AddCols(in_deriv_patches, cu_cols); - } -} - -/* - 3d 
max pooling backpropagate function - This function backpropagate the error from - out_deriv to in_deriv. - In order to select the node in each pool to - backpropagate the error, it has to compare - the output pool value stored in the out_value - matrix with each of its input pool member node - stroed in the in_value matrix. -*/ -void MaxpoolingComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update, - CuMatrixBase *in_deriv) const { - if (!in_deriv) - return; - - int32 num_frames = in_value.NumRows(); - int32 num_pools = OutputDim(); - int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_; - CuMatrix patches(num_frames, num_pools * pool_size, kUndefined); - InputToInputPatches(in_value, &patches); - - for (int32 q = 0; q < pool_size; q++) { - // zero-out mask - CuMatrix mask; - out_value.EqualElementMask(patches.ColRange(q * num_pools, num_pools), &mask); - mask.MulElements(out_deriv); - patches.ColRange(q * num_pools, num_pools).CopyFromMat(mask); - } - - // combine the derivatives from the individual input deriv patches - // to compute input deriv matrix - InderivPatchesToInderiv(patches, in_deriv); -} - -void MaxpoolingComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &input_x_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_y_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &input_z_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_size_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_size_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_z_size_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_z_step_); - ExpectToken(is, binary, ""); - Check(); -} - -void MaxpoolingComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_x_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_y_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_z_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_size_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_size_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_z_size_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_z_step_); - WriteToken(os, binary, ""); -} - -// display information about component -std::string MaxpoolingComponent::Info() const { - std::ostringstream stream; - stream << Type() - << ", input-x-dim=" << input_x_dim_ - << ", input-y-dim=" << input_y_dim_ - << ", input-z-dim=" << input_z_dim_ - << ", pool-x-size=" << pool_x_size_ - << ", pool-y-size=" << pool_y_size_ - << ", pool-z-size=" << pool_z_size_ - << ", pool-x-step=" << pool_x_step_ - << ", pool-y-step=" << pool_y_step_ - << ", pool-z-step=" << pool_z_step_; - return stream.str(); -} - void PermuteComponent::ComputeReverseColumnMap() { int32 dim = column_map_.Dim(); KALDI_ASSERT(dim > 0); @@ -5550,371 +4617,6 @@ 
void CompositeComponent::SetComponent(int32 i, Component *component) { components_[i] = component; } -int32 LstmNonlinearityComponent::InputDim() const { - int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5 + (use_dropout_ ? 3 : 0); -} - -int32 LstmNonlinearityComponent::OutputDim() const { - int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 2; -} - - -void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { - ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. - ExpectToken(is, binary, ""); - params_.Read(is, binary); - ExpectToken(is, binary, ""); - value_sum_.Read(is, binary); - ExpectToken(is, binary, ""); - deriv_sum_.Read(is, binary); - ExpectToken(is, binary, ""); - self_repair_config_.Read(is, binary); - ExpectToken(is, binary, ""); - self_repair_total_.Read(is, binary); - - std::string tok; - ReadToken(is, binary, &tok); - if (tok == "") { - ReadBasicType(is, binary, &use_dropout_); - ReadToken(is, binary, &tok); - } else { - use_dropout_ = false; - } - KALDI_ASSERT(tok == ""); - ReadBasicType(is, binary, &count_); - - // For the on-disk format, we normalze value_sum_, deriv_sum_ and - // self_repair_total_ by dividing by the count, but in memory they are scaled - // by the count. [for self_repair_total_, the scaling factor is count_ * - // cell_dim]. - value_sum_.Scale(count_); - deriv_sum_.Scale(count_); - int32 cell_dim = params_.NumCols(); - self_repair_total_.Scale(count_ * cell_dim); - - InitNaturalGradient(); - - ExpectToken(is, binary, ""); - -} - -void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { - WriteUpdatableCommon(os, binary); // Read opening tag and learning rate. - - WriteToken(os, binary, ""); - params_.Write(os, binary); - WriteToken(os, binary, ""); - { - Matrix value_avg(value_sum_); - if (count_ != 0.0) - value_avg.Scale(1.0 / count_); - value_avg.Write(os, binary); - } - WriteToken(os, binary, ""); - { - Matrix deriv_avg(deriv_sum_); - if (count_ != 0.0) - deriv_avg.Scale(1.0 / count_); - deriv_avg.Write(os, binary); - } - WriteToken(os, binary, ""); - self_repair_config_.Write(os, binary); - WriteToken(os, binary, ""); - { - int32 cell_dim = params_.NumCols(); - Vector self_repair_prob(self_repair_total_); - if (count_ != 0.0) - self_repair_prob.Scale(1.0 / (count_ * cell_dim)); - self_repair_prob.Write(os, binary); - } - if (use_dropout_) { - // only write this if true; we have back-compat code in reading anyway. - // this makes the models without dropout easier to read with older code. - WriteToken(os, binary, ""); - WriteBasicType(os, binary, use_dropout_); - } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, count_); - WriteToken(os, binary, ""); -} - - - -std::string LstmNonlinearityComponent::Info() const { - std::ostringstream stream; - int32 cell_dim = params_.NumCols(); - stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim - << ", use-dropout=" << (use_dropout_ ? "true" : "false"); - PrintParameterStats(stream, "w_ic", params_.Row(0)); - PrintParameterStats(stream, "w_fc", params_.Row(1)); - PrintParameterStats(stream, "w_oc", params_.Row(2)); - - // Note: some of the following code mirrors the code in - // UpdatableComponent::Info(), in nnet-component-itf.cc. 
- if (count_ > 0) { - stream << ", count=" << std::setprecision(3) << count_ - << std::setprecision(6); - } - static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh", - "o_t_sigmoid", "m_t_tanh" }; - for (int32 i = 0; i < 5; i++) { - stream << ", " << nonlin_names[i] << "={"; - stream << " self-repair-lower-threshold=" << self_repair_config_(i) - << ", self-repair-scale=" << self_repair_config_(i + 5); - - if (count_ != 0) { - BaseFloat self_repaired_proportion = - self_repair_total_(i) / (count_ * cell_dim); - stream << ", self-repaired-proportion=" << self_repaired_proportion; - Vector value_sum(value_sum_.Row(i)), - deriv_sum(deriv_sum_.Row(i)); - Vector value_avg(value_sum), deriv_avg(deriv_sum); - value_avg.Scale(1.0 / count_); - deriv_avg.Scale(1.0 / count_); - stream << ", value-avg=" << SummarizeVector(value_avg) - << ", deriv-avg=" << SummarizeVector(deriv_avg); - } - stream << " }"; - } - return stream.str(); -} - - -Component* LstmNonlinearityComponent::Copy() const { - return new LstmNonlinearityComponent(*this); -} - -void LstmNonlinearityComponent::ZeroStats() { - value_sum_.SetZero(); - deriv_sum_.SetZero(); - self_repair_total_.SetZero(); - count_ = 0.0; -} - -void LstmNonlinearityComponent::Scale(BaseFloat scale) { - if (scale == 0.0) { - params_.SetZero(); - value_sum_.SetZero(); - deriv_sum_.SetZero(); - self_repair_total_.SetZero(); - count_ = 0.0; - } else { - params_.Scale(scale); - value_sum_.Scale(scale); - deriv_sum_.Scale(scale); - self_repair_total_.Scale(scale); - count_ *= scale; - } -} - -void LstmNonlinearityComponent::Add(BaseFloat alpha, - const Component &other_in) { - const LstmNonlinearityComponent *other = - dynamic_cast(&other_in); - KALDI_ASSERT(other != NULL); - params_.AddMat(alpha, other->params_); - value_sum_.AddMat(alpha, other->value_sum_); - deriv_sum_.AddMat(alpha, other->deriv_sum_); - self_repair_total_.AddVec(alpha, other->self_repair_total_); - count_ += alpha * other->count_; -} - -void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { - CuMatrix temp_params(params_.NumRows(), params_.NumCols()); - temp_params.SetRandn(); - params_.AddMat(stddev, temp_params); -} - -BaseFloat LstmNonlinearityComponent::DotProduct( - const UpdatableComponent &other_in) const { - const LstmNonlinearityComponent *other = - dynamic_cast(&other_in); - KALDI_ASSERT(other != NULL); - return TraceMatMat(params_, other->params_, kTrans); -} - -int32 LstmNonlinearityComponent::NumParameters() const { - return params_.NumRows() * params_.NumCols(); -} - -void LstmNonlinearityComponent::Vectorize(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == NumParameters()); - params->CopyRowsFromMat(params_); -} - - -void LstmNonlinearityComponent::UnVectorize( - const VectorBase ¶ms) { - KALDI_ASSERT(params.Dim() == NumParameters()); - params_.CopyRowsFromVec(params); -} - - -void* LstmNonlinearityComponent::Propagate( - const ComponentPrecomputedIndexes *, // indexes - const CuMatrixBase &in, - CuMatrixBase *out) const { - cu::ComputeLstmNonlinearity(in, params_, out); - return NULL; -} - - -void LstmNonlinearityComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const { - - if (to_update_in == NULL) { - cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, - deriv_sum_, self_repair_config_, - count_, in_deriv, - 
(CuMatrixBase*) NULL, - (CuMatrixBase*) NULL, - (CuMatrixBase*) NULL, - (CuMatrixBase*) NULL); - } else { - LstmNonlinearityComponent *to_update = - dynamic_cast(to_update_in); - KALDI_ASSERT(to_update != NULL); - - int32 cell_dim = params_.NumCols(); - CuMatrix params_deriv(3, cell_dim, kUndefined); - CuMatrix self_repair_total(5, cell_dim, kUndefined); - - cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, - deriv_sum_, self_repair_config_, - count_, in_deriv, ¶ms_deriv, - &(to_update->value_sum_), - &(to_update->deriv_sum_), - &self_repair_total); - - CuVector self_repair_total_sum(5); - self_repair_total_sum.AddColSumMat(1.0, self_repair_total, 0.0); - to_update->self_repair_total_.AddVec(1.0, self_repair_total_sum); - to_update->count_ += static_cast(in_value.NumRows()); - - BaseFloat scale = 1.0; - if (!to_update->is_gradient_) { - to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, &scale); - } - to_update->params_.AddMat(to_update->learning_rate_ * scale, - params_deriv); - } -} - -LstmNonlinearityComponent::LstmNonlinearityComponent( - const LstmNonlinearityComponent &other): - UpdatableComponent(other), - params_(other.params_), - use_dropout_(other.use_dropout_), - value_sum_(other.value_sum_), - deriv_sum_(other.deriv_sum_), - self_repair_config_(other.self_repair_config_), - self_repair_total_(other.self_repair_total_), - count_(other.count_), - preconditioner_(other.preconditioner_) { } - -void LstmNonlinearityComponent::Init( - int32 cell_dim, bool use_dropout, - BaseFloat param_stddev, - BaseFloat tanh_self_repair_threshold, - BaseFloat sigmoid_self_repair_threshold, - BaseFloat self_repair_scale) { - KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 && - tanh_self_repair_threshold >= 0.0 && - tanh_self_repair_threshold <= 1.0 && - sigmoid_self_repair_threshold >= 0.0 && - sigmoid_self_repair_threshold <= 0.25 && - self_repair_scale >= 0.0 && self_repair_scale <= 0.1); - use_dropout_ = use_dropout; - params_.Resize(3, cell_dim); - params_.SetRandn(); - params_.Scale(param_stddev); - value_sum_.Resize(5, cell_dim); - deriv_sum_.Resize(5, cell_dim); - self_repair_config_.Resize(10); - self_repair_config_.Range(0, 5).Set(sigmoid_self_repair_threshold); - self_repair_config_(2) = tanh_self_repair_threshold; - self_repair_config_(4) = tanh_self_repair_threshold; - self_repair_config_.Range(5, 5).Set(self_repair_scale); - self_repair_total_.Resize(5); - count_ = 0.0; - InitNaturalGradient(); - -} - -void LstmNonlinearityComponent::InitNaturalGradient() { - // As regards the configuration for the natural-gradient preconditioner, we - // don't make it configurable from the command line-- it's unlikely that any - // differences from changing this would be substantial enough to effectively - // tune the configuration. Because the preconditioning code doesn't 'see' the - // derivatives from individual frames, but only averages over the minibatch, - // there is a fairly small amount of data available to estimate the Fisher - // information matrix, so we set the rank, update period and - // num-samples-history to smaller values than normal. 
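When the destination component is actually being trained (rather than just accumulating a gradient), the update in Backprop() above preconditions the derivative of the three diagonal parameter rows with the online natural-gradient object, which also returns a scaling factor, and then adds the result into params_ with the learning rate. A plain sketch of that final step, with the preconditioning abstracted into an ng_scale factor (illustrative only):

#include <vector>

// Final step of the update in Backprop():
//   params += learning_rate * ng_scale * params_deriv
// ng_scale is 1.0 for plain SGD; otherwise it comes from the natural-gradient
// preconditioner, which has already rescaled params_deriv in place.
void ApplyUpdate(std::vector<float> *params,
                 const std::vector<float> &params_deriv,
                 float learning_rate, float ng_scale) {
  for (size_t i = 0; i < params->size(); i++)
    (*params)[i] += learning_rate * ng_scale * params_deriv[i];
}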
- preconditioner_.SetRank(20); - preconditioner_.SetUpdatePeriod(2); - preconditioner_.SetNumSamplesHistory(1000.0); -} - -/// virtual -void LstmNonlinearityComponent::FreezeNaturalGradient(bool freeze) { - preconditioner_.Freeze(freeze); -} - -void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { - InitLearningRatesFromConfig(cfl); - bool ok = true; - bool use_dropout = false; - int32 cell_dim; - // these self-repair thresholds are the normal defaults for tanh and sigmoid - // respectively. If, later on, we decide that we want to support different - // self-repair config values for the individual sigmoid and tanh - // nonlinearities, we can modify this code then. - BaseFloat tanh_self_repair_threshold = 0.2, - sigmoid_self_repair_threshold = 0.05, - self_repair_scale = 1.0e-05; - // param_stddev is the stddev of the parameters. it may be better to - // use a smaller value but this was the default in the python scripts - // for a while. - BaseFloat param_stddev = 1.0; - ok = ok && cfl->GetValue("cell-dim", &cell_dim); - cfl->GetValue("param-stddev", ¶m_stddev); - cfl->GetValue("tanh-self-repair-threshold", - &tanh_self_repair_threshold); - cfl->GetValue("sigmoid-self-repair-threshold", - &sigmoid_self_repair_threshold); - cfl->GetValue("self-repair-scale", &self_repair_scale); - cfl->GetValue("use-dropout", &use_dropout); - - // We may later on want to make it possible to initialize the different - // parameters w_ic, w_fc and w_oc with different biases. We'll implement - // that when and if it's needed. - - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (ok) { - Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, - sigmoid_self_repair_threshold, self_repair_scale); - } else { - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; - } -} - -void LstmNonlinearityComponent::ConsolidateMemory() { - OnlineNaturalGradient preconditioner_temp(preconditioner_); - preconditioner_.Swap(&preconditioner_); -} SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 11c60f8f352..546176f71ee 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1,9 +1,9 @@ // nnet3/nnet-simple-component.h // Copyright 2011-2013 Karel Vesely -// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2012-2017 Johns Hopkins University (author: Daniel Povey) // 2013 Xiaohui Zhang -// 2014-2015 Vijayaditya Peddinti +// 2014-2016 Vijayaditya Peddinti // 2014-2015 Guoguo Chen // 2015 Daniel Galvez // 2015 Tom Ko @@ -42,7 +42,7 @@ namespace nnet3 { /// nnet-general-component.h there are components that don't fit this pattern. /// /// Some components that do provide the kSimpleComponent flag are not declared -/// here: see also nnet-normalize-component.h. +/// here: see also nnet-normalize-component.h and nnet-combined-component.h // This "nnet3" version of the p-norm component only supports the 2-norm. 
class PnormComponent: public Component { @@ -756,7 +756,7 @@ class LogSoftmaxComponent: public NonlinearComponent { Configuration values accepted by this component: Values inherited from UpdatableComponent (see its declaration in - nnet-component-itf for details): + nnet-component-itf.h for details): learning-rate learning-rate-factor max-change @@ -817,8 +817,8 @@ class LogSoftmaxComponent: public NonlinearComponent { matrix in the input space. default=20. rank-out Rank used in low-rank-plus-unit estimate of Fisher matrix in the output-derivative space. default=80. - update-period Determines after with what frequency (in - minibatches) we update the Fisher-matrix estimates; + update-period Determines the period (in minibatches) with which + we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. */ @@ -1815,7 +1815,6 @@ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { }; - /* ScaleAndOffsetComponent implements a per-element scale and offset. It may be useful just after BatchNormComponent, as the trainable offset @@ -1947,523 +1946,6 @@ class ScaleAndOffsetComponent: public UpdatableComponent { }; - -/** - * WARNING, this component is deprecated in favor of - * TimeHeightConvolutionComponent, and will be deleted. - * ConvolutionalComponent implements 2d-convolution. - * It uses 3D filters on 3D inputs, but the 3D filters hop only over - * 2 dimensions as it has same size as the input along the 3rd dimension. - * Input : A matrix where each row is a vectorized 3D-tensor. - * The 3D tensor has dimensions - * x: (e.g. time) - * y: (e.g. frequency) - * z: (e.g. channels like features/delta/delta-delta) - * - * The component supports input vectorizations of type zyx and yzx. - * The default vectorization type is zyx. - * e.g. for input vectorization of type zyx the input is vectorized by - * spanning axes z, y and x of the tensor in that order. - * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions - * the zyx vectorized input looks like - * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1) - * - * - * Output : The output is also a 3D tensor vectorized in the zyx format. - * The channel axis (z) in the output corresponds to the output of - * different filters. The first channel corresponds to the first filter - * i.e., first row of the filter_params_ matrix. - * - * Note: The component has to support yzx input vectorization as the binaries - * like add-deltas generate yz vectorized output. These input vectors are - * concatenated using the Append descriptor across time steps to form a yzx - * vectorized 3D tensor input. - * e.g. Append(Offset(input, -1), input, Offset(input, 1)) - * - * - * For information on the hyperparameters and parameters of this component see - * the variable declarations. - * - * Propagation: - * ------------ - * Convolution operation consists of a dot-products between the filter tensor - * and input tensor patch, for various shifts of filter tensor along the x and y - * axes input tensor. (Note: there is no shift along z-axis as the filter and - * input tensor have same size along this axis). - * - * For a particular shift (i,j) of the filter tensor - * along input tensor dimensions x and y, the elements of the input tensor which - * overlap with the filter form the input tensor patch. This patch is vectorized - * in zyx format. 
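The zyx (default) and yzx orderings described above fix how a coordinate (x, y, z) maps to a column of the input matrix; the column maps built inside the component follow the same arithmetic. A small sketch of the two index computations (illustrative helpers, not part of this patch):

// Flat column index of element (x, y, z) of a tensor with dimensions
// (x_dim, y_dim, z_dim).  "zyx" means z varies fastest, then y, then x;
// "yzx" means y varies fastest, then z, then x.
inline int ZyxIndex(int x, int y, int z, int y_dim, int z_dim) {
  return x * y_dim * z_dim + y * z_dim + z;
}
inline int YzxIndex(int x, int y, int z, int y_dim, int z_dim) {
  return x * y_dim * z_dim + z * y_dim + y;
}
// For the 2x2x2 tensor A above, ZyxIndex enumerates the elements in the order
// A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1).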
All the patches corresponding to various samples in the - * mini-batch are stacked into a matrix, where each row corresponds to one - * patch. Let this matrix be represented by X_{i,j}. The dot products with - * various filters are computed simultaneously by computing the matrix product - * with the filter_params_ matrix (W) - * Y_{i,j} = X_{i,j}*W^T. - * Each row of W corresponds to one filter 3D tensor vectorized in zyx format. - * - * All the matrix products corresponding to various shifts (i,j) of the - * filter tensor are computed simultaneously using the AddMatMatBatched - * call of CuMatrixBase class. - * - * BackPropagation: - * ---------------- - * Backpropagation to compute the input derivative (\nabla X_{i,j}) - * consists of the a series of matrix products. - * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the - * output derivative for a particular shift of the filter. - * - * Once again these matrix products are computed simultaneously. - * - * Update: - * ------- - * The weight gradient is computed as - * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j}) - * - */ -class ConvolutionComponent: public UpdatableComponent { - public: - enum TensorVectorizationType { - kYzx = 0, - kZyx = 1 - }; - - ConvolutionComponent(); - // constructor using another component - ConvolutionComponent(const ConvolutionComponent &component); - // constructor using parameters - ConvolutionComponent( - const CuMatrixBase &filter_params, - const CuVectorBase &bias_params, - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - BaseFloat learning_rate); - - virtual int32 InputDim() const; - virtual int32 OutputDim() const; - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "ConvolutionComponent"; } - virtual int32 Properties() const { - return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput| - kBackpropAdds|kPropagateAdds; - } - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const; - void Update(const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv, - const std::vector *>& out_deriv_batch); - - - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - - virtual Component* Copy() const; - - // Some functions from base-class UpdatableComponent. - virtual void Scale(BaseFloat scale); - virtual void Add(BaseFloat alpha, const Component &other); - virtual void PerturbParams(BaseFloat stddev); - virtual BaseFloat DotProduct(const UpdatableComponent &other) const; - virtual int32 NumParameters() const; - virtual void Vectorize(VectorBase *params) const; - virtual void UnVectorize(const VectorBase ¶ms); - - // Some functions that are specific to this class. 
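Written as dense products, the three operations described above (propagation, input-derivative backprop, and the weight-gradient update) are, for one filter shift (i, j): Y = X W^T, dX = dY W, and dW += X^T dY, where rows of X are vectorized input patches and rows of W are vectorized filters. A toy loop-based sketch of the same products (illustrative only; the component performs them as batched CuMatrix operations on the GPU):

#include <vector>

using Mat = std::vector<std::vector<float> >;  // row-major toy matrix

// Y = X * W^T :  (num_patches x patch_dim) times (num_filters x patch_dim)^T.
Mat Forward(const Mat &X, const Mat &W) {
  Mat Y(X.size(), std::vector<float>(W.size(), 0.0f));
  for (size_t p = 0; p < X.size(); p++)
    for (size_t f = 0; f < W.size(); f++)
      for (size_t d = 0; d < X[p].size(); d++)
        Y[p][f] += X[p][d] * W[f][d];
  return Y;
}

// dX += dY * W  and  dW += X^T * dY.  dX and dW are assumed pre-sized and
// zeroed, or accumulating over shifts as in the Update() equation above.
void Backward(const Mat &X, const Mat &W, const Mat &dY, Mat *dX, Mat *dW) {
  for (size_t p = 0; p < X.size(); p++)
    for (size_t f = 0; f < W.size(); f++)
      for (size_t d = 0; d < X[p].size(); d++) {
        (*dX)[p][d] += dY[p][f] * W[f][d];
        (*dW)[f][d] += X[p][d] * dY[p][f];
      }
}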
- void SetParams(const VectorBase &bias, - const MatrixBase &filter); - const CuVector &BiasParams() const { return bias_params_; } - const CuMatrix &LinearParams() const { return filter_params_; } - void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, int32 num_filters, - TensorVectorizationType input_vectorization, - BaseFloat param_stddev, BaseFloat bias_stddev); - // there is no filt_z_dim parameter as the length of the filter along - // z-dimension is same as the input - void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - std::string matrix_filename); - - // resize the component, setting the parameters to zero, while - // leaving any other configuration values the same - void Resize(int32 input_dim, int32 output_dim); - - void Update(const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv); - - - private: - int32 input_x_dim_; // size of the input along x-axis - // (e.g. number of time steps) - - int32 input_y_dim_; // size of input along y-axis - // (e.g. number of mel-frequency bins) - - int32 input_z_dim_; // size of input along z-axis - // (e.g. number of channels is 3 if the input has - // features + delta + delta-delta features - - int32 filt_x_dim_; // size of the filter along x-axis - - int32 filt_y_dim_; // size of the filter along y-axis - - // there is no filt_z_dim_ as it is always assumed to be - // the same as input_z_dim_ - - int32 filt_x_step_; // the number of steps taken along x-axis of input - // before computing the next dot-product - // of filter and input - - int32 filt_y_step_; // the number of steps taken along y-axis of input - // before computing the next dot-product of the filter - // and input - - // there is no filt_z_step_ as only dot product is possible along this axis - - TensorVectorizationType input_vectorization_; // type of vectorization of the - // input 3D tensor. Accepts zyx and yzx formats - - CuMatrix filter_params_; - // the filter (or kernel) matrix is a matrix of vectorized 3D filters - // where each row in the matrix corresponds to one filter. - // The 3D filter tensor is vectorizedin zyx format. - // The first row of the matrix corresponds to the first filter and so on. - // Keep in mind the vectorization type and order of filters when using file - // based initialization. - - CuVector bias_params_; - // the filter-specific bias vector (i.e., there is a seperate bias added - // to the output of each filter). - - void InputToInputPatches(const CuMatrixBase& in, - CuMatrix *patches) const; - void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const; - const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow. -}; - - -/* - LstmNonlinearityComponent is a component that implements part of an LSTM, by - combining together the sigmoids and tanh's, plus some diagonal terms, into - a single block. - We will refer to the LSTM formulation used in - - Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" - by H. Sak et al, - http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf. - - Suppose the cell dimension is C. 
Then outside this component, we compute - the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single - matrix multiplication: - - i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i - f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f - c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c - o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o - - The part of the computation that takes place in this component is as follows. - Its input is of dimension 5C [however, search for 'dropout' below], - consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). Its - output is of dimension 2C, consisting of 2 blocks: c_t and m_t. - - To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t). - - This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f - and w_o. - - - In the forward pass (Propagate), this component computes the following: - - i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1) - f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2) - c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3) - o_t = Sigmoid(o_part + w_{oc}*c_t) (4) - m_t = o_t * Tanh(c_t) (5) - # note: the outputs are just c_t and m_t. - - [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead - of 5C in this case, the last three input dimensions will be interpreted as - per-frame dropout masks on i_t, f_t and o_t respectively, so that on the RHS of - (3), i_t is replaced by i_t * i_t_scale, and likewise for f_t and o_t.] - - The backprop is as you would think, but for the "self-repair" we need to pass - in additional vectors (of the same dim as the parameters of the layer) that - dictate whether or not we add an additional term to the backpropagated - derivatives. (This term helps force the input to the nonlinearities into the - range where the derivatives are not too small). - - This component stores stats of the same form as are normally stored by the - StoreStats() functions for the sigmoid and tanh units, i.e. averages of the - activations and derivatives, but this is done inside the Backprop() functions. - [the StoreStats() functions don't take the input data as an argument, so - storing this data that way is impossible, and anyway it's more efficient to - do it as part of backprop.] - - Configuration values accepted: - cell-dim e.g. cell-dim=1024 Cell dimension. The input - dimension of this component is cell-dim * 5, and the - output dimension is cell-dim * 2. Note: this - component implements only part of the LSTM layer, - see comments above. - param-stddev Standard deviation for random initialization of - the diagonal matrices (AKA peephole connections). - default=1.0, which is probably too high but - we couldn't see any reliable gain from decreasing it. - tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold - in a TanhComponent; applies to both the tanh nonlinearities. - default=0.2, you probably won't want to changethis. - sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold - in a SigmoidComponent; applies to all three of the sigmoid - nonlinearities. default=0.05, you probably won't want to - change this. - self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent - or TanhComponent; applies to both the sigmoid and tanh - nonlinearities. default=1.0e-05, which you probably won't - want to change unless dealing with an objective function - that has smaller or larger dynamic range than normal, in - which case you might want to make it smaller or larger. 
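Per cell dimension d, equations (1) through (5) above amount to the following scalar computation, with w_ic, w_fc and w_oc taken from column d of the three parameter rows. A self-contained sketch of the forward pass for one dimension, ignoring the optional dropout masks and self-repair terms (the component itself does the whole-minibatch version on the GPU):

#include <cmath>

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// One dimension of the forward pass, following equations (1)-(5):
// inputs are one element each of (i_part, f_part, c_part, o_part, c_{t-1});
// outputs are the matching elements of (c_t, m_t).
void LstmNonlinForward(float i_part, float f_part, float c_part, float o_part,
                       float c_prev, float w_ic, float w_fc, float w_oc,
                       float *c_t, float *m_t) {
  float i_t = Sigmoid(i_part + w_ic * c_prev);     // (1)
  float f_t = Sigmoid(f_part + w_fc * c_prev);     // (2)
  *c_t = f_t * c_prev + i_t * std::tanh(c_part);   // (3)
  float o_t = Sigmoid(o_part + w_oc * (*c_t));     // (4)
  *m_t = o_t * std::tanh(*c_t);                    // (5)
}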
-*/ -class LstmNonlinearityComponent: public UpdatableComponent { - public: - - virtual int32 InputDim() const; - virtual int32 OutputDim() const; - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - LstmNonlinearityComponent(): use_dropout_(false) { } - virtual std::string Type() const { return "LstmNonlinearityComponent"; } - virtual int32 Properties() const { - return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; - } - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update_in, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - - virtual Component* Copy() const; - - // Some functions from base-class UpdatableComponent. - virtual void Scale(BaseFloat scale); - virtual void Add(BaseFloat alpha, const Component &other); - virtual void PerturbParams(BaseFloat stddev); - virtual BaseFloat DotProduct(const UpdatableComponent &other) const; - virtual int32 NumParameters() const; - virtual void Vectorize(VectorBase *params) const; - virtual void UnVectorize(const VectorBase ¶ms); - virtual void ZeroStats(); - virtual void FreezeNaturalGradient(bool freeze); - - // Some functions that are specific to this class: - explicit LstmNonlinearityComponent( - const LstmNonlinearityComponent &other); - - void Init(int32 cell_dim, bool use_dropout, - BaseFloat param_stddev, - BaseFloat tanh_self_repair_threshold, - BaseFloat sigmoid_self_repair_threshold, - BaseFloat self_repair_scale); - - virtual void ConsolidateMemory(); - - private: - - // Initializes the natural-gradient object with the configuration we - // use for this object, which for now is hardcoded at the C++ level. - void InitNaturalGradient(); - - // Notation: C is the cell dimension; it equals params_.NumCols(). - - // The dimension of the parameter matrix is (3 x C); - // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. - CuMatrix params_; - - // If true, we expect an extra 3 dimensions on the input, for dropout masks - // for i_t and f_t. - bool use_dropout_; - - // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in - // equations (1) through (5), this is the sum of the values of the nonliearities - // (used for diagnostics only). It is comparable to value_sum_ vector - // in base-class NonlinearComponent. - CuMatrix value_sum_; - - // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in - // equations (1) through (5), this is the sum of the derivatives of the - // nonliearities (used for diagnostics and to control self-repair). It is - // comparable to the deriv_sum_ vector in base-class - // NonlinearComponent. - CuMatrix deriv_sum_; - - // This matrix has dimension 10. The contents are a block of 5 self-repair - // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5 - // self-repair scales (typically all 0.00001). These are for each of the 5 - // nonlinearities in the LSTM component in turn (see comments in cu-math.h for - // more info). - CuVector self_repair_config_; - - // This matrix has dimension 5. 
For each of the 5 nonlinearities in the LSTM - // component (see comments in cu-math.h for more info), it contains the total, - // over all frames represented in count_, of the number of dimensions that - // were subject to self_repair. To get the self-repair proportion you should - // divide by (count_ times cell_dim_). - CuVector self_repair_total_; - - // The total count (number of frames) corresponding to the stats in value_sum_ - // and deriv_sum_. - double count_; - - // Preconditioner for the parameters of this component [operates in the space - // of dimension C]. - // The preconditioner stores its own configuration values; we write and read - // these, but not the preconditioner object itself. - OnlineNaturalGradient preconditioner_; - - const LstmNonlinearityComponent &operator - = (const LstmNonlinearityComponent &other); // Disallow. -}; - - - - -/* - * WARNING, this component is deprecated as it's not compatible with - * TimeHeightConvolutionComponent, and it will eventually be deleted. - * MaxPoolingComponent : - * Maxpooling component was firstly used in ConvNet for selecting an - * representative activation in an area. It inspired Maxout nonlinearity. - * Each output element of this component is the maximum of a block of - * input elements where the block has a 3D dimension (pool_x_size_, - * pool_y_size_, pool_z_size_). - * Blocks could overlap if the shift value on any axis is smaller - * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_). - * If the shift values are euqal to their pool size, there is no - * overlap; while if they all equal 1, the blocks overlap to - * the greatest possible extent. - * - * This component is designed to be used after a ConvolutionComponent - * so that the input matrix is propagated from a 2d-convolutional layer. - * This component implements 3d-maxpooling which performs - * max pooling along the three axes. - * Input : A matrix where each row is a vectorized 3D-tensor. - * The 3D tensor has dimensions - * x: (e.g. time) - * y: (e.g. frequency) - * z: (e.g. channels like number of filters in the ConvolutionComponent) - * - * The component assumes input vectorizations of type zyx - * which is the default output vectorization type of a ConvolutionComponent. - * e.g. for input vectorization of type zyx the input is vectorized by - * spanning axes z, y and x of the tensor in that order. - * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions - * the zyx vectorized input looks like - * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1) - * - * Output : The output is also a 3D tensor vectorized in the zyx format. - * - * For information on the hyperparameters and parameters of this component see - * the variable declarations. 
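The pooling geometry above implies the following dimension arithmetic: along each axis there are 1 + (input_dim - pool_size) / pool_step windows, and the component's output dimension is the product of the three window counts (one output value per 3-D window). A small sketch, assuming the windows tile the input exactly, which the component's Check() is expected to enforce (illustrative helpers, not part of this patch):

#include <cassert>

// Number of pooling windows along one axis; windows overlap when
// pool_step < pool_size and tile the axis exactly when pool_step == pool_size.
inline int NumPools(int input_dim, int pool_size, int pool_step) {
  assert((input_dim - pool_size) % pool_step == 0);
  return 1 + (input_dim - pool_size) / pool_step;
}

// Output dimension of 3-D max pooling: one value per window.
inline int MaxpoolOutputDim(int x_dim, int y_dim, int z_dim,
                            int pool_x, int pool_y, int pool_z,
                            int step_x, int step_y, int step_z) {
  return NumPools(x_dim, pool_x, step_x) *
         NumPools(y_dim, pool_y, step_y) *
         NumPools(z_dim, pool_z, step_z);
}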
- * - * - */ -class MaxpoolingComponent: public Component { - public: - - MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), - pool_x_size_(0), pool_y_size_(0), pool_z_size_(0), - pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { } - // constructor using another component - MaxpoolingComponent(const MaxpoolingComponent &component); - - virtual int32 InputDim() const; - virtual int32 OutputDim() const; - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "MaxpoolingComponent"; } - virtual int32 Properties() const { - return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput| - kBackpropAdds; - } - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - virtual Component* Copy() const { return new MaxpoolingComponent(*this); } - - - protected: - void InputToInputPatches(const CuMatrixBase& in, - CuMatrix *patches) const; - void InderivPatchesToInderiv(const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const; - virtual void Check() const; - - - int32 input_x_dim_; // size of the input along x-axis - // (e.g. number of time steps) - int32 input_y_dim_; // size of input along y-axis - // (e.g. number of mel-frequency bins) - int32 input_z_dim_; // size of input along z-axis - // (e.g. number of filters in the ConvolutionComponent) - - int32 pool_x_size_; // size of the pooling window along x-axis - int32 pool_y_size_; // size of the pooling window along y-axis - int32 pool_z_size_; // size of the pooling window along z-axis - - int32 pool_x_step_; // the number of steps taken along x-axis of input - // before computing the next pool - int32 pool_y_step_; // the number of steps taken along y-axis of input - // before computing the next pool - int32 pool_z_step_; // the number of steps taken along z-axis of input - // before computing the next pool - -}; - - /** CompositeComponent is a component representing a sequence of [simple] components. The config line would be something like the following diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index bae332cd584..a8ef30bc314 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1400,7 +1400,7 @@ void ComputeExampleComputationRequestSimple( static void GenerateRandomComponentConfig(std::string *component_type, std::string *config) { - int32 n = RandInt(0, 35); + int32 n = RandInt(0, 37); BaseFloat learning_rate = 0.001 * RandInt(1, 100); std::ostringstream os; @@ -1757,6 +1757,22 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " use-bias=" << (RandInt(0,1) == 0 ? "true":"false"); break; } + case 36: { + *component_type = "GruNonlinearityComponent"; + int32 cell_dim = RandInt(10, 20); + int32 recurrent_dim = (RandInt(0, 1) == 0 ? 
+                             RandInt(5, cell_dim - 1) : cell_dim);
+      os << "cell-dim=" << cell_dim
+         << " recurrent-dim=" << recurrent_dim;
+      break;
+    }
+    case 37: {
+      *component_type = "OutputGruNonlinearityComponent";
+      os << "cell-dim=" << RandInt(10, 20)
+         << " learning-rate=" << learning_rate;
+
+      break;
+    }
     default:
       KALDI_ERR << "Error generating random component";
   }
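For reference, the two new cases produce config strings of the following shape, which the test code then uses to initialize a component of the named type via InitFromConfig(); the numbers are drawn randomly per test, so the values below are only illustrative:

  cell-dim=16 recurrent-dim=9        (case 36, GruNonlinearityComponent)
  cell-dim=14 learning-rate=0.072    (case 37, OutputGruNonlinearityComponent)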